tika-app 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +27 -0
- data/README.md +31 -0
- data/Rakefile +7 -0
- data/VERSION +1 -0
- data/lib/tika/app.rb +89 -0
- data/lib/tika/command.rb +45 -0
- data/lib/tika/commands.rb +57 -0
- data/lib/tika/error.rb +3 -0
- data/lib/tika/resource.rb +31 -0
- data/lib/tika/result.rb +34 -0
- data/spec/fixtures/Lorem_ipsum.docx +0 -0
- data/spec/fixtures/Lorem_ipsum.pdf +0 -0
- data/spec/fixtures/Lorem_ipsum.png +0 -0
- data/spec/fixtures/Lorem_ipsum.tiff +0 -0
- data/spec/spec_helper.rb +92 -0
- data/spec/unit/app_spec.rb +74 -0
- data/spec/unit/resource_spec.rb +43 -0
- data/tika-app.gemspec +23 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e7e13e3b027b7dfdb22b89fd6c07a8b021c3210e
|
4
|
+
data.tar.gz: 2e1021d068af535cef610eb244540a4c4f671052
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1f8ff1d0edac49772f8ecd1f8577458824014a6dfd359b96c0680f600addb1039ae1c90dd2d4605653698715bb45427abd5e89f5d9102e05844804727356ecc4
|
7
|
+
data.tar.gz: 74dedc5a7debfb4100a9db85dc1a095c0da329e01659b0a8526770ec95c52cddc62fa5d229d1a2e3a7849ac3e5e07f2f8d11f7093e62a3970fca4b75aa91c23b
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
Copyright (c) Duke University.
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without modification,
|
5
|
+
are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
1. Redistributions of source code must retain the above copyright notice,
|
8
|
+
this list of conditions and the following disclaimer.
|
9
|
+
|
10
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer in the documentation
|
12
|
+
and/or other materials provided with the distribution.
|
13
|
+
|
14
|
+
3. Neither the name of Duke University nor the names of its contributors may
|
15
|
+
be used to endorse or promote products derived from this software without
|
16
|
+
specific prior written permission.
|
17
|
+
|
18
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
19
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
20
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
21
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
22
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
23
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
24
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
25
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
26
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
27
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Tika::App
|
2
|
+
|
3
|
+
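Ruby bindings for the Apache Tika app: the gem runs the Tika app JAR with `java -jar` and returns its output (extracted text, metadata, MIME type, language) to Ruby.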
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'tika-app'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install tika-app
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
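Point the gem at a Tika app JAR (with the `:path` option, `Tika::App.path=`, or the `TIKA_APP` environment variable), then call it:

```ruby
require "tika/app"

app = Tika::App.new(path: "/path/to/tika-app.jar")
app.get_text("document.docx")     # extracted text
app.get_metadata("document.pdf")  # metadata parsed from Tika's JSON output
app.detect("image.png")           # MIME type, e.g. "image/png"
```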
|
24
|
+
|
25
|
+
## Contributing
|
26
|
+
|
27
|
+
1. Fork it ( https://github.com/duke-libraries/tika-app/fork )
|
28
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
31
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/tika/app.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require_relative "error"
|
2
|
+
require_relative "commands"
|
3
|
+
require_relative "resource"
|
4
|
+
|
5
|
+
module Tika
  # Front end for the Tika app JAR.
  #
  # Every public `get_*` / `detect` method runs one command class from
  # Tika::Commands against this app, remembers the outcome in #result and
  # returns the rendered output. A failed run raises Tika::Error carrying the
  # error text reported by the result.
  class App
    include Commands

    # Location used when no other path has been configured.
    DEFAULT_PATH = File.expand_path("../../../bin/tika-app.jar", __FILE__)

    class << self
      # Class-wide JAR path; consulted after the :path option.
      attr_accessor :path
    end

    attr_reader :path
    attr_accessor :result

    # @param opts [Hash] recognises :path, the location of the Tika app JAR
    #
    # Resolution order: opts[:path], App.path, ENV["TIKA_APP"], DEFAULT_PATH.
    def initialize(opts = {})
      @path = opts[:path] ||
              self.class.path ||
              ENV["TIKA_APP"] ||
              DEFAULT_PATH
    end

    # Runs GetTextCommand on +file+ and returns its rendered output.
    def get_text(file, opts = {})
      execute(GetTextCommand, file, opts)
    end

    # Runs GetMetadataCommand on +file+ and returns its rendered output.
    def get_metadata(file, opts = {})
      execute(GetMetadataCommand, file, opts)
    end

    # Runs GetVersionCommand and returns its rendered output.
    def get_version
      execute(GetVersionCommand)
    end

    # Runs GetMimeTypesCommand and returns its rendered output.
    def get_mime_types
      execute(GetMimeTypesCommand)
    end
    alias_method :list_supported_types, :get_mime_types

    # Runs GetParsersCommand and returns its rendered output.
    def get_parsers
      execute(GetParsersCommand)
    end
    alias_method :list_parsers, :get_parsers

    # Runs GetParsersDetailsCommand and returns its rendered output.
    def get_parsers_details
      execute(GetParsersDetailsCommand)
    end
    alias_method :list_parser_details, :get_parsers_details

    # Runs GetDetectorsCommand and returns its rendered output.
    def get_detectors
      execute(GetDetectorsCommand)
    end
    alias_method :list_detectors, :get_detectors

    # Runs DetectCommand on +file+ and returns its rendered output.
    def detect(file, opts = {})
      execute(DetectCommand, file, opts)
    end

    # Runs GetLanguageCommand on +file+ and returns its rendered output.
    def get_language(file, opts = {})
      execute(GetLanguageCommand, file, opts)
    end

    # Runs GetMetadataModelsCommand and returns its rendered output.
    def get_metadata_models
      execute(GetMetadataModelsCommand)
    end
    alias_method :list_met_models, :get_metadata_models

    # Frozen argv prefix shared by every command: java -jar <path>.
    def command_line
      ["java", "-jar", path].freeze
    end

    private

    # Forgets the result of the previous command.
    def reset
      @result = nil
    end

    # Runs +command+ (a command class) with this app plus +args+, stores the
    # result, and returns its rendered form.
    #
    # @raise [Error] with the result's error text when the run did not succeed
    def execute(command, *args)
      reset
      @result = command.execute(self, *args)
      raise Error, result.error unless result.success?

      result.render
    end
  end
end
|
data/lib/tika/command.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require "open3"
|
2
|
+
require "tempfile"
|
3
|
+
require_relative "result"
|
4
|
+
|
5
|
+
module Tika
  # Base class for one invocation of the Tika app.
  #
  # Subclasses configure two class-level settings:
  #   options      - Array of command-line switches appended after the app's
  #                  own command line (may be left unset)
  #   result_class - class instantiated with (stdout, stderr, status) to wrap
  #                  the outcome of the run
  class Command

    class << self
      attr_accessor :options, :result_class

      # Builds a command for +app+ with +args+ and runs it immediately.
      def execute(app, *args)
        new(app, *args).execute
      end
    end

    attr_reader :app, :file, :options

    # @param app  [#command_line] object supplying the argv prefix
    # @param args [Array] optional file (first) and options Hash (last);
    #   the only option read here is :password
    def initialize(app, *args)
      @app = app
      @file = args.shift
      @options = args.pop || {}
    end

    def result_class
      self.class.result_class
    end

    # Full argv for this run: app prefix, class switches, optional password
    # switch, optional file.
    #
    # @return [Array<String>] a new Array; neither the app's command line nor
    #   the class-level options are mutated
    def command_line
      # Array() lets a command class that never set `options` contribute no
      # switches instead of raising TypeError on `Array + nil`.
      cmd = app.command_line + Array(self.class.options)
      cmd << "-p#{options[:password]}" if options[:password]
      # Process arguments must be Strings; to_s lets callers pass a Pathname
      # or URI as well as a plain String.
      cmd << file.to_s if file
      cmd
    end

    # Runs the command without a shell (argv form) and wraps the captured
    # stdout, stderr and exit status in #result_class.
    def execute
      raw_result = Open3.capture3(*command_line)
      result_class.new(*raw_result)
    end

  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require_relative "command"
|
2
|
+
|
3
|
+
module Tika
  # Concrete command classes, one per Tika app switch set. Each class only
  # declares the switches passed to the app and the result class that wraps
  # the captured output.
  module Commands

    # Switches: -t -eUTF8; output rendered as text.
    class GetTextCommand < Command
      self.result_class = TextResult
      self.options = %w[-t -eUTF8]
    end

    # Switch: -j; output rendered by parsing it as JSON.
    class GetMetadataCommand < Command
      self.result_class = JSONResult
      self.options = %w[-j]
    end

    # Switch: -V; output rendered as text.
    class GetVersionCommand < Command
      self.result_class = TextResult
      self.options = %w[-V]
    end

    # Switch: --list-supported-types; output rendered as text.
    class GetMimeTypesCommand < Command
      self.result_class = TextResult
      self.options = %w[--list-supported-types]
    end

    # Switch: --list-parsers; output rendered as text.
    class GetParsersCommand < Command
      self.result_class = TextResult
      self.options = %w[--list-parsers]
    end

    # Switch: --list-parser-details; output rendered as text.
    class GetParsersDetailsCommand < Command
      self.result_class = TextResult
      self.options = %w[--list-parser-details]
    end

    # Switch: --list-detectors; output rendered as text.
    class GetDetectorsCommand < Command
      self.result_class = TextResult
      self.options = %w[--list-detectors]
    end

    # Switch: -d; output rendered as text.
    class DetectCommand < Command
      self.result_class = TextResult
      self.options = %w[-d]
    end

    # Switch: --list-met-models; output rendered as text.
    class GetMetadataModelsCommand < Command
      self.result_class = TextResult
      self.options = %w[--list-met-models]
    end

    # Switch: -l; output rendered as text.
    class GetLanguageCommand < Command
      self.result_class = TextResult
      self.options = %w[-l]
    end

  end
end
|
data/lib/tika/error.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative "app"
|
2
|
+
|
3
|
+
module Tika
  # A single file paired with a freshly built App. Each reader asks the app
  # once and keeps the answer, so repeated calls reuse the stored value
  # instead of running the app again.
  class Resource
    attr_reader :file, :app, :opts

    # @param file the file handed to every app call
    # @param opts [Hash] options forwarded unchanged to every app call
    def initialize(file, opts = {})
      @file, @opts = file, opts
      @app = App.new
    end

    # Memoized result of App#get_text for this file.
    def text
      @text ||= app.get_text(file, opts)
    end

    # Memoized result of App#get_metadata for this file.
    def metadata
      @metadata ||= app.get_metadata(file, opts)
    end

    # Memoized result of App#detect for this file.
    def content_type
      @content_type ||= app.detect(file, opts)
    end

    # Memoized result of App#get_language for this file.
    def language
      @language ||= app.get_language(file, opts)
    end
  end
end
|
data/lib/tika/result.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require "json"
|
2
|
+
require "forwardable"
|
3
|
+
|
4
|
+
module Tika

  # Outcome of one Tika app run: captured standard output, captured standard
  # error, and the process status. Subclasses decide how #output is rendered.
  class Result
    extend Forwardable

    attr_reader :output, :error, :status

    # success? is answered by the status object.
    def_delegator :status, :success?

    # @param output captured standard output
    # @param error  captured standard error
    # @param status [#success?] exit status of the run
    def initialize(output, error, status)
      @output, @error, @status = output, error, status
    end

    # @return the output exactly as captured
    def render
      output
    end
  end

  # Renders the output with leading and trailing whitespace removed.
  class TextResult < Result
    def render
      output.strip
    end
  end

  # Renders the output by parsing it as JSON.
  class JSONResult < Result
    # @return [Object, nil] the parsed document, or nil when there is no
    #   output to parse
    # @raise [JSON::ParserError] when the output is not valid JSON
    def render
      return nil if output.nil? || output.strip.empty?

      # The output is derived from arbitrary documents, so it is parsed as
      # plain data: JSON.parse never revives objects from "json_class" keys,
      # which JSON.load may do. max_nesting/allow_nan keep the lenient limits
      # JSON.load applied.
      JSON.parse(output, max_nesting: false, allow_nan: true, create_additions: false)
    end
  end

end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
require "tika/app"
|
2
|
+
|
3
|
+
FIXTURE_DIR = File.expand_path("../fixtures", __FILE__)
|
4
|
+
|
5
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
6
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
7
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
8
|
+
# this file to always be loaded, without a need to explicitly require it in any
|
9
|
+
# files.
|
10
|
+
#
|
11
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
12
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
13
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
14
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
15
|
+
# a separate helper file that requires the additional dependencies and performs
|
16
|
+
# the additional setup, and require it from the spec files that actually need
|
17
|
+
# it.
|
18
|
+
#
|
19
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
20
|
+
# users commonly want.
|
21
|
+
#
|
22
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
23
|
+
RSpec.configure do |config|
|
24
|
+
# rspec-expectations config goes here. You can use an alternate
|
25
|
+
# assertion/expectation library such as wrong or the stdlib/minitest
|
26
|
+
# assertions if you prefer.
|
27
|
+
config.expect_with :rspec do |expectations|
|
28
|
+
# This option will default to `true` in RSpec 4. It makes the `description`
|
29
|
+
# and `failure_message` of custom matchers include text for helper methods
|
30
|
+
# defined using `chain`, e.g.:
|
31
|
+
# be_bigger_than(2).and_smaller_than(4).description
|
32
|
+
# # => "be bigger than 2 and smaller than 4"
|
33
|
+
# ...rather than:
|
34
|
+
# # => "be bigger than 2"
|
35
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
36
|
+
end
|
37
|
+
|
38
|
+
# rspec-mocks config goes here. You can use an alternate test double
|
39
|
+
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
40
|
+
config.mock_with :rspec do |mocks|
|
41
|
+
# Prevents you from mocking or stubbing a method that does not exist on
|
42
|
+
# a real object. This is generally recommended, and will default to
|
43
|
+
# `true` in RSpec 4.
|
44
|
+
mocks.verify_partial_doubles = true
|
45
|
+
end
|
46
|
+
|
47
|
+
# These two settings work together to allow you to limit a spec run
|
48
|
+
# to individual examples or groups you care about by tagging them with
|
49
|
+
# `:focus` metadata. When nothing is tagged with `:focus`, all examples
|
50
|
+
# get run.
|
51
|
+
config.filter_run :focus
|
52
|
+
config.run_all_when_everything_filtered = true
|
53
|
+
|
54
|
+
# Limits the available syntax to the non-monkey patched syntax that is
|
55
|
+
# recommended. For more details, see:
|
56
|
+
# - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
|
57
|
+
# - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
|
58
|
+
# - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
|
59
|
+
config.disable_monkey_patching!
|
60
|
+
|
61
|
+
# This setting enables warnings. It's recommended, but in some cases may
|
62
|
+
# be too noisy due to issues in dependencies.
|
63
|
+
config.warnings = true
|
64
|
+
|
65
|
+
# Many RSpec users commonly either run the entire suite or an individual
|
66
|
+
# file, and it's useful to allow more verbose output when running an
|
67
|
+
# individual spec file.
|
68
|
+
if config.files_to_run.one?
|
69
|
+
# Use the documentation formatter for detailed output,
|
70
|
+
# unless a formatter has already been configured
|
71
|
+
# (e.g. via a command-line flag).
|
72
|
+
config.default_formatter = 'doc'
|
73
|
+
end
|
74
|
+
|
75
|
+
# Print the 10 slowest examples and example groups at the
|
76
|
+
# end of the spec run, to help surface which specs are running
|
77
|
+
# particularly slow.
|
78
|
+
config.profile_examples = 10
|
79
|
+
|
80
|
+
# Run specs in random order to surface order dependencies. If you find an
|
81
|
+
# order dependency and want to debug it, you can fix the order by providing
|
82
|
+
# the seed, which is printed after each run.
|
83
|
+
# --seed 1234
|
84
|
+
config.order = :random
|
85
|
+
|
86
|
+
# Seed global randomization in this process using the `--seed` CLI option.
|
87
|
+
# Setting this allows you to use `--seed` to deterministically reproduce
|
88
|
+
# test failures related to randomization by passing the same `--seed` value
|
89
|
+
# as the one that triggered the failure.
|
90
|
+
Kernel.srand config.seed
|
91
|
+
|
92
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Tika
|
2
|
+
RSpec.describe App do
|
3
|
+
|
4
|
+
describe "#get_text" do
|
5
|
+
describe "with a document" do
|
6
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.docx") }
|
7
|
+
it "should return the text of the file" do
|
8
|
+
text = subject.get_text(file)
|
9
|
+
expect(text).to match(/Lorem ipsum/)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
describe "with an image" do
|
13
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.tiff") }
|
14
|
+
it "should return the text of the file" do
|
15
|
+
text = subject.get_text(file)
|
16
|
+
expect(text).to match(/^Lorem ipsum/)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "#get_metadata" do
|
22
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.pdf") }
|
23
|
+
it "should return the metadata of the file" do
|
24
|
+
metadata = subject.get_metadata(file, content_type: "application/pdf")
|
25
|
+
expect(metadata["Creation-Date"]).to eq("2015-02-15T01:54:41Z")
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "#get_version" do
|
30
|
+
it "should return the Tika app version" do
|
31
|
+
expect(subject.get_version).to match(/^Apache Tika/)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "#get_mime_types" do
|
36
|
+
it "should return the MIME Types support by the Tika app" do
|
37
|
+
expect(subject.get_mime_types).to match(/application\/pdf/)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "#get_parsers" do
|
42
|
+
it "should return the parsers available to the Tika app" do
|
43
|
+
expect(subject.get_parsers).to match(/org\.apache\.tika\.parser\.DefaultParser/)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe "#get_parsers_details" do
|
48
|
+
it "should return the parsers available to the Tika app and the MIME types they support" do
|
49
|
+
expect(subject.get_parsers_details).to match(/org\.apache\.tika\.parser\.DefaultParser/)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
describe "#get_detectors" do
|
54
|
+
it "should return the detectors available to the Tika app" do
|
55
|
+
expect(subject.get_detectors).to match(/org\.apache\.tika\.detect\.DefaultDetector/)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "#detect" do
|
60
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.png") }
|
61
|
+
it "should return the MIME type of the resource (if successful)" do
|
62
|
+
expect(subject.detect(file)).to eq("image/png")
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
describe "#get_language" do
|
67
|
+
let(:file) { "http://www.archives.gov/exhibits/charters/constitution_transcript.html" }
|
68
|
+
it "should return the language code" do
|
69
|
+
expect(subject.get_language(file)).to eq("en")
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Tika
|
2
|
+
RSpec.describe Resource do
|
3
|
+
|
4
|
+
subject { described_class.new(file) }
|
5
|
+
|
6
|
+
describe "#text" do
|
7
|
+
describe "with a document" do
|
8
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.docx") }
|
9
|
+
it "should return the text of the file" do
|
10
|
+
expect(subject.text).to match(/^Lorem ipsum/)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
describe "with an image" do
|
14
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.tiff") }
|
15
|
+
it "should return the text of the file" do
|
16
|
+
expect(subject.text).to match(/^Lorem ipsum/)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "#metadata" do
|
22
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.pdf") }
|
23
|
+
it "should return the metadata of the file" do
|
24
|
+
expect(subject.metadata["Creation-Date"]).to eq("2015-02-15T01:54:41Z")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
describe "#content_type" do
|
29
|
+
let(:file) { File.join(FIXTURE_DIR, "Lorem_ipsum.png") }
|
30
|
+
it "should return the MIME type of the resource (if successful)" do
|
31
|
+
expect(subject.content_type).to eq("image/png")
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "#language" do
|
36
|
+
let(:file) { "http://www.archives.gov/exhibits/charters/constitution_transcript.html" }
|
37
|
+
it "should return the language code" do
|
38
|
+
expect(subject.language).to eq("en")
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
data/tika-app.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = "tika-app"
|
7
|
+
spec.version = File.read(File.expand_path("../VERSION", __FILE__)).chomp
|
8
|
+
spec.authors = ["dchandekstark"]
|
9
|
+
spec.email = ["dchandekstark@gmail.com"]
|
10
|
+
spec.summary = "Ruby Tika app bindings"
|
11
|
+
spec.description = "Ruby Tika app bindings"
|
12
|
+
spec.homepage = "https://github.com/duke-libraries/tika-app"
|
13
|
+
spec.license = "BSD-3-Clause"
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0")
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ["lib"]
|
19
|
+
|
20
|
+
spec.add_development_dependency "bundler", "~> 1.7"
|
21
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
22
|
+
spec.add_development_dependency "rspec", "~> 3.1"
|
23
|
+
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tika-app
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- dchandekstark
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.1'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.1'
|
55
|
+
description: Ruby Tika app bindings
|
56
|
+
email:
|
57
|
+
- dchandekstark@gmail.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".gitignore"
|
63
|
+
- ".rspec"
|
64
|
+
- Gemfile
|
65
|
+
- LICENSE
|
66
|
+
- README.md
|
67
|
+
- Rakefile
|
68
|
+
- VERSION
|
69
|
+
- lib/tika/app.rb
|
70
|
+
- lib/tika/command.rb
|
71
|
+
- lib/tika/commands.rb
|
72
|
+
- lib/tika/error.rb
|
73
|
+
- lib/tika/resource.rb
|
74
|
+
- lib/tika/result.rb
|
75
|
+
- spec/fixtures/Lorem_ipsum.docx
|
76
|
+
- spec/fixtures/Lorem_ipsum.pdf
|
77
|
+
- spec/fixtures/Lorem_ipsum.png
|
78
|
+
- spec/fixtures/Lorem_ipsum.tiff
|
79
|
+
- spec/spec_helper.rb
|
80
|
+
- spec/unit/app_spec.rb
|
81
|
+
- spec/unit/resource_spec.rb
|
82
|
+
- tika-app.gemspec
|
83
|
+
homepage: https://github.com/duke-libraries/tika-app
|
84
|
+
licenses:
|
85
|
+
- BSD-3-Clause
|
86
|
+
metadata: {}
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 2.2.2
|
104
|
+
signing_key:
|
105
|
+
specification_version: 4
|
106
|
+
summary: Ruby Tika app bindings
|
107
|
+
test_files:
|
108
|
+
- spec/fixtures/Lorem_ipsum.docx
|
109
|
+
- spec/fixtures/Lorem_ipsum.pdf
|
110
|
+
- spec/fixtures/Lorem_ipsum.png
|
111
|
+
- spec/fixtures/Lorem_ipsum.tiff
|
112
|
+
- spec/spec_helper.rb
|
113
|
+
- spec/unit/app_spec.rb
|
114
|
+
- spec/unit/resource_spec.rb
|