rika 1.6.0-java → 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +6 -4
  3. data/.rubocop.yml +49 -0
  4. data/Gemfile +12 -0
  5. data/README.md +226 -47
  6. data/RELEASE_NOTES.md +43 -0
  7. data/Rakefile +4 -7
  8. data/bin/rika +13 -0
  9. data/lib/rika/cli/args_parser.rb +131 -0
  10. data/lib/rika/cli/rika_command.rb +129 -0
  11. data/lib/rika/formatters.rb +39 -0
  12. data/lib/rika/parse_result.rb +34 -0
  13. data/lib/rika/parser.rb +84 -0
  14. data/lib/rika/tika_loader.rb +65 -0
  15. data/lib/rika/version.rb +3 -1
  16. data/lib/rika.rb +96 -104
  17. data/pom.xml +2 -2
  18. data/rika.gemspec +30 -15
  19. data/rika_helper.rb +30 -0
  20. data/spec/fixtures/de.txt +21 -1
  21. data/spec/fixtures/document.doc +0 -0
  22. data/spec/fixtures/document.docx +0 -0
  23. data/spec/fixtures/document.pdf +0 -0
  24. data/spec/fixtures/document.txt +23 -0
  25. data/spec/fixtures/en.txt +23 -1
  26. data/spec/fixtures/es.txt +21 -1
  27. data/spec/fixtures/fr.txt +23 -1
  28. data/spec/fixtures/image_jpg_without_extension +0 -0
  29. data/spec/fixtures/ru.txt +21 -1
  30. data/spec/fixtures/tiny.txt +1 -0
  31. data/spec/rika/cli/args_parser_spec.rb +117 -0
  32. data/spec/rika/cli/rika_command_spec.rb +120 -0
  33. data/spec/rika/formatters_spec.rb +23 -0
  34. data/spec/rika/parse_result_spec.rb +42 -0
  35. data/spec/rika/parser_spec.rb +304 -0
  36. data/spec/rika/rika_spec.rb +10 -0
  37. data/spec/rika/tika_loader_spec.rb +57 -0
  38. data/spec/spec_helper.rb +13 -5
  39. metadata +54 -98
  40. data/.travis.yml +0 -7
  41. data/spec/fixtures/over_100k_file.txt +0 -1241
  42. data/spec/fixtures/text_file.txt +0 -1
  43. data/spec/fixtures/text_file_without_extension +0 -1
  44. data/spec/rika_spec.rb +0 -202
  45. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  46. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  47. data/target/dependency/asm-debug-all-4.1.jar +0 -0
  48. data/target/dependency/aspectjrt-1.8.0.jar +0 -0
  49. data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
  50. data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
  51. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  52. data/target/dependency/commons-codec-1.9.jar +0 -0
  53. data/target/dependency/commons-compress-1.8.1.jar +0 -0
  54. data/target/dependency/commons-httpclient-3.1.jar +0 -0
  55. data/target/dependency/commons-logging-1.1.1.jar +0 -0
  56. data/target/dependency/fontbox-1.8.6.jar +0 -0
  57. data/target/dependency/isoparser-1.0.2.jar +0 -0
  58. data/target/dependency/java-libpst-0.8.1.jar +0 -0
  59. data/target/dependency/jcip-annotations-1.0.jar +0 -0
  60. data/target/dependency/jdom-1.0.jar +0 -0
  61. data/target/dependency/jempbox-1.8.6.jar +0 -0
  62. data/target/dependency/jhighlight-1.0.jar +0 -0
  63. data/target/dependency/jmatio-1.0.jar +0 -0
  64. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  65. data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
  66. data/target/dependency/netcdf-4.2.20.jar +0 -0
  67. data/target/dependency/pdfbox-1.8.6.jar +0 -0
  68. data/target/dependency/poi-3.11-beta2.jar +0 -0
  69. data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
  70. data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
  71. data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
  72. data/target/dependency/rome-1.0.jar +0 -0
  73. data/target/dependency/slf4j-api-1.6.1.jar +0 -0
  74. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  75. data/target/dependency/tika-core-1.6.jar +0 -0
  76. data/target/dependency/tika-parsers-1.6.jar +0 -0
  77. data/target/dependency/unidataCommon-4.2.20.jar +0 -0
  78. data/target/dependency/vorbis-java-core-0.6.jar +0 -0
  79. data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
  80. data/target/dependency/xercesImpl-2.8.1.jar +0 -0
  81. data/target/dependency/xml-apis-1.3.03.jar +0 -0
  82. data/target/dependency/xmlbeans-2.6.0.jar +0 -0
  83. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  84. data/target/dependency/xz-1.5.jar +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 077d41fc4797ad2074f4acdd6df554cb7221ef01
4
- data.tar.gz: 7f1a591fe8bad0e68ca8173e1d8d7690b16341db
2
+ SHA256:
3
+ metadata.gz: 70dc9cbc6d2da17d3bcff5cdeb3a905fdbcf2e2b9d24c131558566cbfc19eada
4
+ data.tar.gz: fb81f98476322d2291488b2bd2cb9c89a1544b4ee1f85ce22ab773c30ac4765c
5
5
  SHA512:
6
- metadata.gz: 07346daa2e40986329f5b19e85d12e02b9a6cb802249d9a303aa86022e6b7515f309c3b411338bfe1d82e5bb27e2c4e5de0a6cdb18221e80b8c414c5bb5deaa2
7
- data.tar.gz: c26125c52c0c0411aa8c5dff2f7f0f81e28d0d63f4b241ca6ef47d5ea841c50576c5c40ce47a072c31ec7f16a6312510ab194c6f47499847b323f4bd80c29c68
6
+ metadata.gz: 780650b18df09662b8b67e7321641c2cf36d78335c5cf1f59f5d81a23cf262dc478958ad7aab306f80deb235e9c9910332086b3ea5231888ea96c26abd2d2505
7
+ data.tar.gz: afa841ebfe9ebb1bee5ccb0550fedde43edab6e1d6b86eed671537604d64b318c0abc7e41ffee054443f6bfd4a7b3c9423926e137fba4fdd95da66775e35d695
data/.gitignore CHANGED
@@ -1,7 +1,10 @@
1
1
  *.gem
2
2
  *.rbc
3
+ .DS_Store
3
4
  .bundle
4
5
  .config
6
+ coverage/
7
+ .idea/
5
8
  .yardoc
6
9
  Gemfile.lock
7
10
  InstalledFiles
@@ -10,12 +13,11 @@ coverage
10
13
  doc/
11
14
  lib/bundler/man
12
15
  pkg
16
+ projectFilesBackup/
13
17
  rdoc
14
18
  spec/reports
19
+ spec/rspec-failed-tests-control-file.txt
20
+ target/
15
21
  test/tmp
16
22
  test/version_tmp
17
23
  tmp
18
-
19
- .DS_Store
20
- projectFilesBackup/
21
- .idea/
data/.rubocop.yml ADDED
@@ -0,0 +1,49 @@
1
+ require: rubocop-rspec
2
+ AllCops:
3
+ NewCops: enable
4
+ Include:
5
+ - '**/*.rb'
6
+ - '*.gemspec'
7
+ - '**/Rakefile'
8
+ - '**/Gemfile'
9
+ - 'bin/rika'
10
+ Gemspec/RequiredRubyVersion:
11
+ Enabled: false
12
+ Layout/HashAlignment:
13
+ Enabled: false
14
+ Metrics/AbcSize:
15
+ Enabled: false
16
+ Metrics/BlockLength:
17
+ Enabled: false
18
+ Metrics/MethodLength:
19
+ Enabled: false
20
+ RSpec/ExampleLength:
21
+ Enabled: false
22
+ RSpec/ExpectOutput:
23
+ Enabled: false
24
+ RSpec/InstanceVariable:
25
+ Enabled: false
26
+ RSpec/MultipleExpectations:
27
+ Enabled: false
28
+ RSpec/MultipleMemoizedHelpers:
29
+ Enabled: false
30
+ Style/AccessModifierDeclarations:
31
+ Enabled: false
32
+ Style/FetchEnvVar:
33
+ Enabled: false
34
+ Style/GuardClause:
35
+ Enabled: false
36
+ Style/IfUnlessModifier:
37
+ Enabled: false
38
+ Style/Lambda:
39
+ Enabled: false
40
+ Style/LambdaCall:
41
+ Enabled: false
42
+ Style/NumericLiterals:
43
+ Enabled: false
44
+ Style/PercentLiteralDelimiters:
45
+ Enabled: false
46
+ Style/StderrPuts:
47
+ Enabled: false
48
+ Style/TrailingUnderscoreVariable:
49
+ Enabled: false
data/Gemfile CHANGED
@@ -1,4 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  # Specify your gem's dependencies in rika.gemspec
4
6
  gemspec
7
+
8
+ group :development do
9
+ gem 'pry'
10
+ gem 'rake', '~> 13.0'
11
+ gem 'rspec', '~> 3.9'
12
+ gem 'rubocop'
13
+ gem 'rubocop-rspec'
14
+ gem 'simplecov', require: false
15
+ gem 'webrick', '~> 1.6'
16
+ end
data/README.md CHANGED
@@ -1,83 +1,262 @@
1
1
  # Rika
2
2
 
3
- A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
3
+ [Rika](https://github.com/keithrbennett/rika) is a [JRuby](https://www.jruby.org) wrapper for
4
+ the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources
5
+ of [many different formats](https://tika.apache.org/1.24.1/formats.html).
4
6
 
5
- More information about Apache Tika can be found here: http://tika.apache.org/
7
+ Rika can be used as a library in your Ruby code, or on the command line.
6
8
 
7
- [![Code Climate](https://codeclimate.com/github/ricn/rika.png)](https://codeclimate.com/github/ricn/rika)
8
- [![Build Status](https://travis-ci.org/ricn/rika.png?branch=master)](https://travis-ci.org/ricn/rika)
9
+ For class and method level documentation, please use [YARD](https://rubydoc.info/gems/yard).
10
+ You can `gem install yard`, then run `yard doc` from the project root,
11
+ and then open the `doc/index.html` file in a browser.
9
12
 
10
- ## Installation
11
13
 
12
- Add this line to your application's Gemfile:
14
+ ### Requirements
15
+
16
+ * This gem only works with [JRuby](https://www.jruby.org).
17
+ * The [Apache Tika](http://tika.apache.org/) jar file must be installed on your system.
18
+ See the [Installation](#installation) section below for more information.
19
+
20
+ Rika currently supports some basic and commonly used functions of Tika.
21
+ Since it runs on JRuby, the Tika library's Java methods can be called directly from Ruby code
22
+ for more advanced needs.
23
+ See the [Other Tika Resources](#other-tika-resources) section of this document for alternatives to
24
+ Rika that may suit more demanding needs.
13
25
 
14
- gem 'rika'
26
+ Rika can be used either as a gem in your own Ruby project, or on the command line using the provided executable.
15
27
 
16
- Remember that this gem only works on JRuby.
28
+ ## Usage in Your Ruby Code
17
29
 
18
- And then execute:
30
+ > [!IMPORTANT]
31
+ > **It is necessary to call `Rika.init` before using Rika.** This is because the loading of the Tika library
32
+ has been put in an init method, rather than at load time, so that 'jar file not found or specified' errors
33
+ do not prevent your application from loading. If you forget to call `Rika.init`, you may see seemingly unrelated
34
+ error messages.
19
35
 
20
- $ bundle
36
+ As a convenience, the `Rika.init` method is called automatically when you call the Rika module methods. However,
37
+ if you access other Rika classes and methods, `init` may not have been called yet, so you should call it yourself.
21
38
 
22
- Or install it yourself as:
39
+ ----
23
40
 
24
- $ gem install rika
41
+ The Rika `parse` method returns a `Rika::ParseResult` object that contains the parsed text and
42
+ various pieces of metadata. The `ParseResult` class' main methods are:
25
43
 
26
- ## Usage
44
+ * `content` - the parsed text
45
+ * `metadata` - a hash of metadata key/value pairs
46
+ * `content_type` - the content type of the parsed data, e.g. "text/plain; charset=UTF-8"
47
+ * `language` - the language of the parsed data, e.g. "en"
48
+ * `data_source` - the data source, either a filespec or a URL
27
49
 
28
- For a quick start with the simplest use cases, the following functions
29
- are provided to get what you need in a single function call, for your convenience:
50
+ For example:
30
51
 
31
52
  ```ruby
32
53
  require 'rika'
33
54
 
34
- content = Rika.parse_content('document.pdf') # string containing all content text
35
- metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
36
- content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
55
+ parse_result = Rika.parse('x.pdf') # returns a Rika::ParseResult object
56
+ parse_result.content # string containing all content text
57
+ parse_result.text # 'text' is an alias for 'content'
58
+ parse_result.metadata # hash containing the document metadata
59
+ parse_result.content_type # e.g. "application/pdf"
60
+ parse_result.language # e.g. "en"
61
+ parse_result.data_source # e.g. "x.pdf"
37
62
  ```
38
63
 
39
- For other use cases and finer control, you can work directly with the Rika::Parser object:
64
+ A URL can be used instead of a filespec wherever a data source is specified:
40
65
 
41
66
  ```ruby
42
- require 'rika'
67
+ parse_result = Rika.parse('https://github.com/keithrbennett/rika')
68
+ ```
69
+
70
+ The Rika module also has the following methods:
71
+
72
+ ```ruby
73
+ Rika.language("magnifique") # => "fr"
74
+ Rika.tika_version # => "2.9.0"
75
+ ```
76
+
77
+ ## Command Line Executable Usage
78
+
79
+ Rika can also be used on the command line using the `rika` executable. For example, the simplest form is to simply
80
+ specify one or more filespecs or URLs as arguments:
81
+
82
+ ```bash
83
+ rika x.pdf https://github.com/keithrbennett/rika
84
+ ```
85
+ Here is the help text:
86
+
87
+ ```
88
+ Rika v2.0.0 (Tika v2.9.0) - https://github.com/keithrbennett/rika
89
+
90
+ Usage: rika [options] <file or url> [...file or url...]
91
+ Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
92
+ If a format contains two letters, the first will be used for metadata, the second for text.
93
+ Values for the text, metadata, and as_array boolean options may be specified as follows:
94
+ Enable: +, true, yes, [empty]
95
+ Disable: -, false, no, [long form option with no- prefix, e.g. --no-metadata]
96
+
97
+ -f, --format FORMAT Output format (default: at)
98
+ -m, --[no-]metadata [FLAG] Output metadata (default: true)
99
+ -t, --[no-]text [FLAG] Output text (default: true)
100
+ -k, --[no-]key-sort [FLAG] Sort metadata keys case insensitively (default: true)
101
+ -s, --[no-]source [FLAG] Document source file or URL
102
+ -a, --[no-]as-array [FLAG] Output all parsed results as an array (default: false)
103
+ -v, --version Output version
104
+ -h, --help Output help
105
+ ```
106
+
107
+ ### Outputting Only Metadata or Only Parsed Text
108
+
109
+ The default setting is to output both metadata and text. To disable either, use the `-m` or `-t` options
110
+ with a disabling flag, e.g. `-m-`, `-m false`, `-m no`, or `--no-metadata` to disable metadata.
111
+
112
+ ### Outputting the Document Source Identifier (Filespec or URL)
113
+
114
+ There are many times when it is useful to know the source of the document. For example, if you are processing
115
+ a large number of documents, you may want to know which document a particular piece of output came from.
116
+
117
+ The document source identifier is output by default. To disable it, use the `-s` option with a disabling flag, e.g. `-s-`,
118
+ `-s false`, `-s no`, or `--no-source`.
119
+
120
+ ### Output Formats
43
121
 
44
- parser = Rika::Parser.new('document.pdf')
122
+ The `-f` option can be used to specify the output format. The default is `at`, which means that the metadata will be
123
+ output in awesome_print format, and the text will be output using `to_s`
124
+ (i.e. without any changes to the parsed string).
45
125
 
46
- # Return the content of the document:
47
- parser.content
126
+ If a single argument to `-f` is specified, it will be used for both metadata and text. If two arguments are specified,
127
+ the first will be used for metadata and the second for the parsed text.
48
128
 
49
- # Return the media type for the document:
50
- parser.media_type
51
- => "application/pdf"
129
+ ### Sorting of Metadata Keys
52
130
 
53
- # Return the metadata field title if it exists:
54
- parser.metadata["title"] if parser.metadata_exists?("title")
131
+ By default, metadata keys will be sorted case insensitively. To disable this, use the `-k` option
132
+ with a disabling flag, i.e. `-k-`, `-k false`, `-k no`, or `--no-key-sort`.
55
133
 
56
- # Return all the available metadata keys that can be read from the document
57
- parser.available_metadata
134
+ The case insensitivity is implemented by using `String#downcase`.
135
+ This may not sort correctly on some non-English systems.
58
136
 
59
- # Return only the first 10000 chars of the content:
60
- parser = Rika::Parser.new('document.pdf', 10000)
61
- parser.content # 10000 first chars returned
137
+ ### Specifying Command Line Options in the RIKA_OPTIONS Environment Variable
62
138
 
63
- # Return content from URL
64
- parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
65
- parser.content
139
+ If you find yourself using the same options over and over again, you can put them in the `RIKA_OPTIONS` environment
140
+ variable. For example, if the default behavior of sorting keys does not work for your language, you can disable it
141
+ for all invocations of the `rika` command by specifying `-k-` in the RIKA_OPTIONS environment variable.
66
142
 
67
- # Return the language for the content
68
- parser = parser = Rika::Parser.new('german document.pdf')
69
- parser.language
70
- => "de"
143
+ ### Machine Readable Data Support
71
144
 
72
- # Check whether the langugage identification is certain enough to be trusted
73
- parser.language_is_reasonably_certain?
74
-
145
+ If both metadata and text are output, and the same output format is used for both, and that format is JSON
146
+ (plain or "pretty") or YAML, then the output per document will be a single JSON or YAML hash representation
147
+ containing both the metadata and the text (whose keys are "metadata" and "text"). This enables piping
148
+ the results of multiple documents to a file or to another program that can use it as a data source.
149
+ In addition, when processing multiple files, this streaming approach will be more efficient
150
+ than calling Rika separately for each file, since each invocation of the rika command requires starting up
151
+ a Java Virtual Machine.
152
+
153
+ If the `-a` (`--as-array`) option is specified, then the output will be an array of such hashes, one for each file.
154
+ This enables the output to be used as a data source for programs that can process an array of hashes, e.g. for analysis.
155
+
156
+ For example, here is how to use Rika and [rexe](https://github.com/keithrbennett/rexe) to get a tally
157
+ of content types for a set of documents, sorted by content type:
158
+
159
+ ```bash
160
+ $ rika -t- -s- -fy -a spec/fixtures/* | \
161
+ rexe -iy -oa -mb "map { |r| r['metadata']['Content-Type'] }.tally.sort.to_h"
162
+ {
163
+ "application/msword" => 1,
164
+ "application/octet-stream" => 1,
165
+ "application/pdf" => 1,
166
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => 1,
167
+ "image/jpeg" => 2,
168
+ "text/plain; charset=ISO-8859-1" => 1,
169
+ "text/plain; charset=UTF-8" => 6,
170
+ "text/x-matlab; charset=ISO-8859-1" => 1
171
+ }
75
172
  ```
173
+ Here is a breakdown of the above command:
174
+
175
+ * `rika`
176
+ * `-t-` suppresses the output of text
177
+ * `-s-` suppresses the output of the source identifier
178
+ * `-fy` outputs the data in YAML format.
179
+ * `-a` option causes the output to be an array of hashes, one for each file
180
+ * `rexe`
181
+ * `-iy` indicates that the input is YAML
182
+ * `-oa` indicates that the output should be done using awesome_print/amazing_print
183
+ * `-mb` indicates that all input should be ingested as a single string ("b" for "big string", as opposed to streamed)
184
+
185
+ * Ruby code passed to `rexe`
186
+ * `map` is called on the array to extract the content type from each parsed document hash
187
+ * `tally` is called on the resulting array to get the count of each content type
188
+ * `sort` is called on the hash to sort it by key (content type) and return an array of 2-element arrays
189
+ * `to_h` is called on the array of 2-element arrays to convert it back to a hash
190
+
191
+ Here is another example that prints out the 5 most common words in all the parsed text, and their counts,
192
+ as "pretty" JSON:
193
+
194
+ ```bash
195
+ $ rika -m- spec/fixtures/* | \
196
+ rexe -in -oJ -mb 'downcase \
197
+ .split \
198
+ .tally \
199
+ .sort_by { |word, count| [-count, word] }
200
+ .first(5) \
201
+ .to_h'
202
+
203
+ {
204
+ "the": 35,
205
+ "to": 30,
206
+ "woods": 25,
207
+ "i": 25,
208
+ "and": 25
209
+ }
210
+ ```
211
+
212
+ ## Installation
213
+
214
+ * Install [JRuby](https://www.jruby.org) if you don't already have it. Ruby version managers such as
215
+ [rvm](https://rvm.io/) and [rbenv](https://github.com/rbenv) can simplify this process.
216
+ * Download the [Apache Tika](http://tika.apache.org/) jar file from
217
+ http://tika.apache.org/download.html (look for the "tika-app" jar file).
218
+ Put it in a place that makes sense for your system, such as `/usr/local/lib`.
219
+ * Configure the `TIKA_JAR_FILESPEC` environment variable to point to the Tika jar file.
220
+ For example, if you are using tika-app-2.9.0.jar, and put the jar file in `/opt/jars`,
221
+ then the setting of the environment variable should look like this:
222
+
223
+ ```bash
224
+ export TIKA_JAR_FILESPEC=/opt/jars/tika-app-2.9.0.jar
225
+ ```
226
+
227
+ You can put this in your `.bashrc` or `.zshrc` file to make it persistent.
228
+
229
+ * Install the gem:
230
+
231
+ ```bash
232
+ gem install rika
233
+ ```
234
+
235
+ or, if you're using [bundler](https://bundler.io/), add this to your Gemfile:
236
+
237
+ ```ruby
238
+ gem 'rika'
239
+ ```
240
+
241
+ and then run `bundle install`.
242
+ * Verify that it works by running (as an example) `rika -m https://www.github.com`.
243
+ You should see key/value pairs representing the metadata of the GitHub home page.
244
+
245
+ This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
246
+ without any version manager at all, but those configurations have not been tested.
247
+
248
+ ## Other Tika Resources
249
+
250
+ * The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
251
+
252
+ * Tika also provides another jar file containing a RESTful server that you can run on the command line.
253
+ You can download this server jar from http://tika.apache.org/download.html (look for the "tika-server-standard" jar
254
+ file).
255
+ See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer
256
+ for more information.
76
257
 
77
- ## Credits
78
- The following people have contributed ideas, documentation, or code to Rika:
79
- * Keith Bennett
80
- * Richard Nyström
258
+ * @chrismattmann and others have provided a ["tika-python" Python library and CLI](https://github.com/chrismattmann/tika-python)
259
+ that interfaces with the Tika server.
81
260
 
82
261
  ## Contributing
83
262
 
data/RELEASE_NOTES.md ADDED
@@ -0,0 +1,43 @@
1
+ ## Release Notes
2
+
3
+ #### v2.0.0
4
+
5
+ * Add features:
6
+ * command line interface
7
+ * support for JSON, Pretty JSON, YAML, AwesomePrint, to_s, and inspect output formats
8
+ * optional array mode (previously only nonarray streaming mode).
9
+ * more persistent options can be specified in an environment variable, `RIKA_OPTIONS`.
10
+ * metadata keys can optionally be sorted case insensitively (the sort may not be correct for some non-English languages).
11
+ * properties added by Rika to the metadata: data-source, language
12
+ * Filespec or URL data source identifier can optionally be output with metadata and text.
13
+ * Add support for Tika 2.8.0, breaks compatibility with Tika 1.x.
14
+ * Remove tika-app-1.24.1.jar from code base and gem (but it is still in git history).
15
+ * Tika jar file is now downloaded by the user and found via environment variable `TIKA_JAR_FILESPEC`.
16
+ * New class ParseResult created to simplify result access and Parser class.
17
+ * Add `Rika.tika_version`.
18
+ * Add `webrick` dependency, needed for current versions of Ruby.
19
+ * Remove deprecated methods `Parser#available_metadata` and `Parser#metadata_exists?`.
20
+ * Move `Parser#language` to `Rika.language`.
21
+ * Remove `Parser#language_is_reasonably_certain?`, no longer supported by Tika.
22
+ * Remove obsolete `LanguageIdentifier` import. Otherwise updated language detection.
23
+ * Various refactorings and improvements.
24
+ * Add SimpleCov test coverage and Rubocop linting tools to project.
25
+ * Set up RSpec configuration to enable --only-failures and --next-failure options.
26
+
27
+
28
+
29
+ #### v1.11.1
30
+
31
+ * Add Apache-2.0 license to gemspec.
32
+
33
+
34
+ #### v1.11.0
35
+
36
+ * Replace 2015 Tika jar files w/2020 tika-app-1.24.1.jar.
37
+ * Handover of maintainer status from @ricn to @keithrbennett.
38
+ * Add rika_helper.rb to provide abbreviated method names for interactive use w/pry, etc.
39
+ * Extract parser class to its own file.
40
+ * Various cleanup and refactoring.
41
+ * Improve README.md documentation.
42
+ * Tested successfully on Java 14.
43
+ * Move Tika jar file from /target/dependency to /java-lib.
data/Rakefile CHANGED
@@ -1,11 +1,8 @@
1
- require "bundler/gem_tasks"
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
2
4
  require 'rspec/core/rake_task'
3
5
 
4
6
  RSpec::Core::RakeTask.new(:spec)
5
7
 
6
- task :default => :spec
7
-
8
- desc 'Download jars'
9
- task :download_jars do
10
- system "mvn dependency:copy-dependencies"
11
- end
8
+ task default: :spec
data/bin/rika ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'rika/cli/rika_command'
5
+
6
+ begin
7
+ Rika.init
8
+ rescue Rika::TikaLoadError => e
9
+ $stderr.puts e.message
10
+ exit 1
11
+ end
12
+
13
+ RikaCommand.new.call
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Processes the array of arguments (ARGV by default) and returns the options, targets, and help string.
4
+ class ArgsParser
5
+ attr_reader :args, :options, :option_parser
6
+ private :args, :options, :option_parser
7
+
8
+ DEFAULT_OPTIONS =
9
+ {
10
+ as_array: false,
11
+ format: 'at', # AwesomePrint for metadata, to_s for text content
12
+ metadata: true,
13
+ text: true,
14
+ source: true,
15
+ key_sort: true
16
+ }.freeze
17
+
18
+ # Parses the command line arguments.
19
+ # Shorthand for ArgsParser.new.call. This call is recommended to protect the caller in case
20
+ # this functionality is repackaged as a Module or otherwise modified.
21
+ # @param [Array] args the command line arguments (overridable for testing, etc.)
22
+ # @return [Array<Hash,String>] [options, targets, help_string],
23
+ # or exits if help or version requested or no targets specified.
24
+ def self.call(args = ARGV)
25
+ new.call(args)
26
+ end
27
+
28
+ # Parses the command line arguments.
29
+ # @param [Array] args the command line arguments (overridable for testing, etc.)
30
+ # @return [Array<Hash,String>] [options, targets, help_string],
31
+ # or exits if help or version requested or no targets specified.
32
+ def call(args = ARGV)
33
+ @args = args
34
+ @options = DEFAULT_OPTIONS.dup
35
+ prepend_environment_args
36
+ @option_parser = create_option_parser
37
+ option_parser.parse!(args)
38
+ postprocess_format_options
39
+ targets = create_target_array
40
+ [options, targets, option_parser.help]
41
+ end
42
+
43
+ # @return [OptionParser]
44
+ private def create_option_parser
45
+ OptionParser.new do |opts|
46
+ opts.banner = <<~BANNER
47
+ Rika v#{Rika::VERSION} (Tika v#{Rika.tika_version}) - #{Rika::PROJECT_URL}
48
+
49
+ Usage: rika [options] <file or url> [...file or url...]
50
+ Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
51
+ If a format contains two letters, the first will be used for metadata, the second for text.
52
+ Values for the text, metadata, and as_array boolean options may be specified as follows:
53
+ Enable: +, true, yes, [empty]
54
+ Disable: -, false, no, [long form option with no- prefix, e.g. --no-metadata]
55
+
56
+ BANNER
57
+
58
+ format_message = 'Output format (default: at)'
59
+ opts.on('-f', '--format FORMAT', format_message) do |format|
60
+ options[:format] = format
61
+ end
62
+
63
+ opts.on('-m', '--[no-]metadata [FLAG]', TrueClass, 'Output metadata (default: true)') do |v|
64
+ options[:metadata] = (v.nil? ? true : v)
65
+ end
66
+
67
+ opts.on('-t', '--[no-]text [FLAG]', TrueClass, 'Output text (default: true)') do |v|
68
+ options[:text] = (v.nil? ? true : v)
69
+ end
70
+
71
+ opts.on('-k', '--[no-]key-sort [FLAG]', TrueClass, 'Sort metadata keys case insensitively (default: true)') do |v|
72
+ options[:key_sort] = (v.nil? ? true : v)
73
+ end
74
+
75
+ opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Document source file or URL') do |v|
76
+ options[:source] = (v.nil? ? true : v)
77
+ end
78
+
79
+ opts.on('-a', '--[no-]as-array [FLAG]', TrueClass,
80
+ 'Output all parsed results as an array (default: false)') do |v|
81
+ options[:as_array] = (v.nil? ? true : v)
82
+ end
83
+
84
+ opts.on('-v', '--version', 'Output version') do
85
+ puts versions_string
86
+ exit
87
+ end
88
+
89
+ opts.on('-h', '--help', 'Output help') do
90
+ puts opts
91
+ exit
92
+ end
93
+ end
94
+ end
95
+
96
+ # @return [Array] the targets specified on the command line, possibly expanded by the shell,
97
+ # and with any directories removed.
98
+ private def create_target_array
99
+ targets = args.dup.reject { |arg| File.directory?(arg) }.freeze # reject dirs to handle **/* globbing
100
+ targets.map(&:freeze)
101
+ end
102
+
103
+ # Fills in the second format option character if absent, and removes any excess characters
104
+ # @return [String] format options 2-character value, e.g. 'at'
105
+ private def postprocess_format_options
106
+ # If only one format letter is specified, use it for both metadata and text.
107
+ options[:format] *= 2 if options[:format].length == 1
108
+
109
+ # Ignore and remove extra characters after the first two format characters.
110
+ options[:format] = options[:format][0..1]
111
+ end
112
+
113
+ # If the user wants to specify options in an environment variable ("RIKA_OPTIONS"),
114
+ # then this method will insert those options at the beginning of the `args` array,
115
+ # where they can be overridden by command line arguments.
116
+ private def prepend_environment_args
117
+ env_opt_string = environment_options
118
+ args_to_prepend = Shellwords.shellsplit(env_opt_string)
119
+ args.unshift(args_to_prepend).flatten!
120
+ end
121
+
122
+ # @return [String] the value of the RIKA_OPTIONS environment variable if present, else ''.
123
+ private def environment_options
124
+ ENV['RIKA_OPTIONS'] || ''
125
+ end
126
+
127
+ # @return [String] string containing versions of Rika and Tika, with labels
128
+ private def versions_string
129
+ "Versions: Rika: #{Rika::VERSION}, Tika: #{Rika.tika_version}"
130
+ end
131
+ end