ddr-filetools 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cd1c3088ed857301a2108f8921c4784538374cc4
4
+ data.tar.gz: 4004b38451534990699d74f034729cab559f7cd2
5
+ SHA512:
6
+ metadata.gz: ef1570db43cc860bdc3abf6e739f2eabdfece85fc6f5fdbce36f15cdf145eb857303743a527b00422f381558eb9b9104d53264485443a51e2480577b0ef53645
7
+ data.tar.gz: d79b1d0f67a1af8c769a45bdba0b69b603826703d5d89962e9010327c0beae03176efe108f04e8df86f139e9c270b96f719c918228517c65d795adcfae25b7d1
@@ -0,0 +1,6 @@
1
+ Gemfile.lock
2
+ coverage
3
+ bin/tika-*
4
+ bin/fits
5
+ pkg
6
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --warnings
3
+ --require spec_helper
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1
4
+ before_script: "bundle exec rake tika:download fits:download"
5
+ cache:
6
+ - bundler
7
+ notifications:
8
+ email:
9
+ - lib-drs@duke.edu
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
# Runtime and development dependencies come from the gemspec; only extra
# tooling that is not part of the gem is listed here.
source 'https://rubygems.org'

gemspec

# Not auto-required by Bundler; code that needs it requires it explicitly.
gem "coveralls", require: false
@@ -0,0 +1,12 @@
1
+ Copyright (c) 2014, Duke University
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5
+
6
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7
+
8
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9
+
10
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11
+
12
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,44 @@
1
+ # Ddr::FileTools
2
+
3
+ File extraction and analysis tools.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'ddr-filetools'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install ddr-filetools
18
+
19
+ ## Dependencies
20
+
21
+ TODO
22
+
23
+ ## Configuration
24
+
25
+ TODO
26
+
27
+ There are rake tasks for downloading Tika and FITS to expected locations.
28
+
29
+ ```sh
30
+ rake tika:download
31
+ rake fits:download
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ TODO
37
+
38
+ ## Contributing
39
+
40
+ 1. Fork it ( https://github.com/[my-github-username]/ddr-filetools/fork )
41
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
42
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
43
+ 4. Push to the branch (`git push origin my-new-feature`)
44
+ 5. Create a new Pull Request
@@ -0,0 +1,9 @@
1
require "bundler/gem_tasks"
require 'rspec/core/rake_task'

# Load the Tika/FITS download tasks by absolute path. A bare
# `load "tasks/ddr_filetools.rake"` only resolves when this gem's lib
# directory is already on $LOAD_PATH (e.g. when rake is run through Bundler).
load File.expand_path("../lib/tasks/ddr_filetools.rake", __FILE__)

desc "Run all specs in spec directory"
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the spec suite.
task :default => :spec
File without changes
@@ -0,0 +1,24 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'ddr/filetools/version'

# Gem specification for ddr-filetools. The file list is taken from git, so
# only tracked files are packaged.
Gem::Specification.new do |spec|
  spec.name          = "ddr-filetools"
  spec.version       = Ddr::FileTools::VERSION
  spec.authors       = ["David Chandek-Stark"]
  spec.email         = ["dchandekstark@gmail.com"]
  spec.summary       = "Pluggable text and metadata extraction service."
  spec.description   = "Pluggable text and metadata extraction service."
  spec.homepage      = "https://github.com/duke-libraries/ddr-filetools"
  spec.license       = "BSD-3-Clause"

  spec.files         = `git ls-files -z`.split("\x0")
  # Ignore dotfiles under bin/ (such as the bin/.keep placeholder); otherwise
  # ".keep" is published as one of the gem's executables.
  spec.executables   = spec.files.grep(%r{^bin/[^.]}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", "~> 3.0"
end
@@ -0,0 +1 @@
1
# Entry point under the `ddr-extraction` file name. It loads the
# Ddr::FileTools library; the former target, "ddr/extraction", is not among
# the files shipped with this gem, so requiring it raised LoadError.
require "ddr/filetools"
@@ -0,0 +1,29 @@
1
+ require_relative "filetools/version"
2
+ require_relative "filetools/client"
3
+ require_relative "filetools/tool"
4
+ require_relative "filetools/tika"
5
+ require_relative "filetools/fits"
6
+ require_relative "filetools/text_command"
7
+ require_relative "filetools/metadata_command"
8
+ require_relative "filetools/ocr_command"
9
+
10
module Ddr
  # Registry of named file tools. Each entry is a Tool built from a command
  # class and a provider class.
  module FileTools

    # Registered tools keyed by name; an empty Hash is created on first use.
    #
    # @return [Hash]
    def self.tools
      @tools ||= {}
    end

    # Builds a Tool from +opts+ and stores it under +tool_name+, replacing
    # any tool previously stored under that name.
    #
    # @param tool_name [Symbol] key used to look the tool up
    # @param opts [Hash] options handed to Tool.new
    # @return [Tool] the tool that was registered
    def self.register(tool_name, opts)
      tools[tool_name] = Tool.new(opts)
    end

    # Tools available out of the box.
    register :text, command: TextCommand, provider: Tika
    register :metadata, command: MetadataCommand, provider: Fits
    register :ocr, command: OcrCommand, provider: Tika

  end
end
29
+
@@ -0,0 +1,12 @@
1
module Ddr
  module FileTools
    # Runs tools that have been registered with Ddr::FileTools.
    class Client

      # Looks up the tool registered as +tool_name+ and calls it.
      #
      # @param tool_name [Symbol] name the tool was registered under
      # @param args arguments forwarded unchanged to the tool
      # @return whatever the tool's +call+ returns
      # @raise [KeyError] when no tool is registered under +tool_name+
      def run_tool(tool_name, *args)
        Ddr::FileTools.tools.fetch(tool_name).call(*args)
      end

    end
  end
end
@@ -0,0 +1,17 @@
1
module Ddr
  module FileTools
    # Abstract base for commands. A command holds a provider and, in a
    # subclass, turns +call+ into a request on that provider.
    class Command

      # @return the provider given to the constructor
      attr_reader :provider

      # @param provider object that performs the actual work
      def initialize(provider)
        @provider = provider
      end

      # Template method; concrete commands override it.
      #
      # @param file_path path of the file to operate on
      # @raise [NotImplementedError] always, in this base class
      def call(file_path)
        raise NotImplementedError, "Subclasses must implement `call`."
      end

    end
  end
end
@@ -0,0 +1,10 @@
1
# Default locations of the Tika jar and the FITS launcher under the gem's
# bin directory.
#
# This file used to configure `Ddr::Extraction`, a module this gem does not
# define, so requiring it failed. The same two paths are now assigned to the
# provider classes that actually exist.
require_relative "tika"
require_relative "fits"

# Resolved relative to this file rather than the working directory.
bin_dir = File.expand_path("../../../../bin", __FILE__)

Ddr::FileTools::Tika.tika_path = File.join(bin_dir, "tika-app.jar")
Ddr::FileTools::Fits.fits_path = File.join(bin_dir, "fits", "fits.sh")
10
+
@@ -0,0 +1,27 @@
1
+ require_relative "provider"
2
+
3
module Ddr
  module FileTools
    # Provider that runs the FITS command-line tool on a file.
    class Fits < Provider

      class << self
        # Path to FITS executable (fits.sh or fits.bat)
        attr_accessor :fits_path
      end

      # Default executable location, resolved relative to this file.
      bundled_bin = File.expand_path("../../../../bin", __FILE__)
      self.fits_path = File.join(bundled_bin, "fits", "fits.sh")

      # Runs FITS with `-i file_path`.
      #
      # @param file_path [String] path of the file to analyse
      # @return the value returned by Provider#call for that command
      def metadata(file_path)
        call([self.class.fits_path, "-i", file_path])
      end

    end
  end
end
27
+
@@ -0,0 +1,15 @@
1
+ require_relative "command"
2
+
3
module Ddr
  module FileTools
    # Command that asks its provider for a file's metadata.
    class MetadataCommand < Command

      # @param file_path path of the file to inspect
      # @return whatever the provider's +metadata+ method returns
      def call(file_path)
        provider.metadata(file_path)
      end

    end
  end
end
14
+
15
+
@@ -0,0 +1,15 @@
1
+ require_relative "command"
2
+
3
module Ddr
  module FileTools
    # Command that asks its provider to run OCR on a file.
    class OcrCommand < Command

      # @param file_path path of the file to process
      # @return whatever the provider's +ocr+ method returns
      def call(file_path)
        provider.ocr(file_path)
      end

    end
  end
end
14
+
15
+
@@ -0,0 +1,15 @@
1
+ require 'open3'
2
+ require_relative "result"
3
+
4
module Ddr
  module FileTools
    # Base class for providers that shell out to an external program.
    class Provider

      # Runs +cmd+ and captures what it produced.
      #
      # @param cmd [Array<String>] program followed by its arguments; the
      #   elements are splatted into Open3.capture3
      # @return [Result] wraps standard output, standard error and the
      #   process status
      def call(cmd)
        stdout, stderr, status = Open3.capture3(*cmd)
        Result.new(stdout, stderr, status)
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
module Ddr
  module FileTools
    # Outcome of running an external command: captured standard output,
    # captured standard error and the process status.
    class Result

      attr_reader :output, :error, :status

      # @param out standard output of the command
      # @param err standard error of the command
      # @param s status object; it must respond to +success?+
      def initialize(out, err, s)
        @output = out
        @error  = err
        @status = s
      end

      # Short form that shows only whether the command succeeded.
      def inspect
        state = success? ? 'SUCCESS' : 'ERROR'
        "#<Ddr::FileTools::Result #{state}>"
      end

      # @return the captured standard output
      def to_s
        output
      end

      # @return the captured standard output (same value as #to_s)
      def read
        output
      end

      # @return true when the status reports success
      def success?
        status.success?
      end

      # @return true when the status does not report success
      def error?
        !success?
      end

    end
  end
end
@@ -0,0 +1,15 @@
1
+ require_relative "command"
2
+
3
module Ddr
  module FileTools
    # Command that asks its provider for a file's text content.
    class TextCommand < Command

      # @param file_path path of the file to read
      # @return whatever the provider's +text+ method returns
      def call(file_path)
        provider.text(file_path)
      end

    end
  end
end
14
+
15
+
@@ -0,0 +1,30 @@
1
+ require_relative "provider"
2
+
3
module Ddr
  module FileTools
    # Provider that runs the Tika application jar through `java -jar`.
    class Tika < Provider

      class << self
        # Path to tika-app.jar
        attr_accessor :tika_path
      end

      # Default jar location, resolved relative to this file.
      self.tika_path = File.join(File.expand_path("../../../../bin", __FILE__), "tika-app.jar")

      # Runs Tika with `--text` on the file.
      #
      # @param file_path [String] path of the file to process
      # @return the value returned by Provider#call
      def text(file_path)
        call command(file_path, "--text")
      end

      # Runs Tika with `--metadata --xml` on the file.
      #
      # @param file_path [String] path of the file to process
      # @return the value returned by Provider#call
      def metadata(file_path)
        call command(file_path, "--metadata", "--xml")
      end

      # Text extraction for the :ocr tool. The tool is registered with this
      # provider and OcrCommand calls `provider.ocr`, which was missing and
      # therefore raised NoMethodError.
      #
      # NOTE(review): this delegates to #text on the assumption that Tika
      # applies its Tesseract-based OCR parser to image input during text
      # extraction when Tesseract is installed -- confirm for the Tika
      # version in use.
      #
      # @param file_path [String] path of the file to process
      # @return the value returned by Provider#call
      def ocr(file_path)
        text(file_path)
      end

      private

      # Builds the argument vector: java -jar <jar> <options...> <file>.
      def command(file_path, *options)
        ["java", "-jar", self.class.tika_path, options, file_path].flatten
      end

    end
  end
end
@@ -0,0 +1,24 @@
1
module Ddr
  module FileTools
    # Pairs a command class with a provider class. Each invocation builds a
    # new provider and a new command around it.
    class Tool

      attr_reader :command, :provider

      # @param opts [Hash] must contain :command and :provider (both classes)
      # @raise [KeyError] when either key is missing
      def initialize(opts)
        @command, @provider = opts.fetch(:command), opts.fetch(:provider)
      end

      # Instantiates the provider, wraps it in the command and runs it.
      #
      # @param args arguments forwarded to the command's +call+
      # @return whatever the command returns
      def call(*args)
        command.new(provider.new).call(*args)
      end

    end
  end
end
@@ -0,0 +1,5 @@
1
module Ddr
  module FileTools
    # Release version of the gem; the gemspec reads this constant.
    VERSION = "0.4.0"
  end
end
@@ -0,0 +1,56 @@
1
# Rake tasks that download Tika and FITS into the project's bin directory.
# Paths are resolved against the directory rake is run from.
require "fileutils"
require "openssl"
require "net/http"

DOWNLOAD_DIR = File.absolute_path("tmp")
BIN_DIR = File.absolute_path("bin")
TIKA_VERSION = "1.7"
FITS_VERSION = "0.8.3"

# Versions can be overridden through the TIKA_VERSION / FITS_VERSION
# environment variables.
tika_version = ENV["TIKA_VERSION"] || TIKA_VERSION
tika_path = File.join(BIN_DIR, "tika-app.jar")
tika_app = File.basename(tika_path)
# NOTE(review): the jar and its checksum both come from the same host over
# plain HTTP, so the checksum guards against corruption only, not tampering.
# Consider switching to HTTPS once confirmed to work for this host.
tika_download_url = "http://archive.apache.org/dist/tika/tika-app-#{tika_version}.jar"
tika_checksum_url = "#{tika_download_url}.sha"
tika_checksum_type = :SHA1

fits_version = ENV["FITS_VERSION"] || FITS_VERSION
fits_path = File.join(BIN_DIR, "fits", "fits.sh")
fits_download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{fits_version}.zip"

namespace :tika do
  desc "Download Tika app"
  task :download => [:download_dir] do
    FileUtils.cd(DOWNLOAD_DIR) do
      puts "Downloading Tika app ... "
      # Argument-list form: no shell is involved, and the exit status is
      # checked. --fail makes curl report HTTP errors as failures.
      unless system("curl", "--fail", "-L", tika_download_url, "-o", tika_app)
        abort "Tika download failed -- aborting!"
      end
      checksum = Net::HTTP.get(URI(tika_checksum_url)).strip
      puts "Verifying checksum ... "
      digest = OpenSSL::Digest.const_get(tika_checksum_type).new
      # The jar is binary data; read it without text-mode translation.
      digest << File.binread(tika_app)
      if digest.hexdigest != checksum
        puts "Checksums do not match -- aborting!"
        FileUtils.remove_entry_secure(tika_app)
        abort
      end
      FileUtils.mv(tika_app, tika_path)
    end
  end
end

namespace :fits do
  desc "Download FITS tool"
  task :download => :download_dir do
    fits_dir = File.dirname(fits_path)
    FileUtils.cd(DOWNLOAD_DIR) do
      puts "Downloading FITS tool ... "
      unless system("curl", "--fail", "-L", fits_download_url, "-o", "fits.zip")
        abort "FITS download failed -- aborting!"
      end
      system("unzip", "-a", "-o", "-q", "fits.zip")
      # Remove an earlier install first; moving onto an existing directory
      # would nest the new one inside it instead of replacing it.
      FileUtils.rm_rf(fits_dir)
      FileUtils.mv("fits-#{fits_version}", fits_dir)
    end
    FileUtils.chmod(0755, fits_path)
  end
end

# Ensures the scratch directory used by both download tasks exists.
task :download_dir do
  FileUtils.mkdir_p(DOWNLOAD_DIR)
end
56
+
Binary file
Binary file
@@ -0,0 +1,83 @@
1
# Start coverage reporting, then load the library under test.
require "coveralls"
Coveralls.wear!

require "ddr/filetools"

# The `.rspec` file passes `--require spec_helper`, so this file is loaded on
# every run without being required from individual specs. Keep it light:
# anything heavy belongs in a separate helper that only the specs needing it
# require.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  # Run only examples or groups tagged `:focus`; when nothing carries the
  # tag, run everything.
  config.filter_run :focus
  config.run_all_when_everything_filtered = true

  # With a single spec file, default to the verbose documentation formatter
  # (a formatter chosen elsewhere, e.g. on the command line, still wins).
  if config.files_to_run.one?
    config.default_formatter = 'doc'
  end

  # Report the 10 slowest examples and example groups after the run.
  config.profile_examples = 10

  # Randomise execution order to expose order dependencies. The seed is
  # printed after each run; rerun with `--seed 1234` to reproduce an order.
  config.order = :random

  # Seed Ruby's global random number generator from the same `--seed` value
  # so randomised failures can be reproduced.
  Kernel.srand config.seed

  # rspec-expectations: allow only the `expect` syntax.
  # - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
  config.expect_with :rspec do |expectations|
    expectations.syntax = :expect
  end

  # rspec-mocks: allow only the `expect` syntax.
  # - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  config.mock_with :rspec do |mocks|
    mocks.syntax = :expect

    # Refuse to mock or stub a method that the real object does not have.
    mocks.verify_partial_doubles = true
  end

end
@@ -0,0 +1,21 @@
1
module Ddr
  module FileTools
    # End-to-end checks that run the registered tools against the files in
    # spec/fixtures.
    RSpec.describe Client do

      subject(:client) { described_class.new }

      describe "extracting text" do
        let(:file_path) { File.expand_path("../../fixtures/sample.docx", __FILE__) }

        it "should extract the text content of the file" do
          result = client.run_tool(:text, file_path)
          expect(result.output).to match(/This is a sample document./)
        end
      end

      describe "extracting metadata" do
        let(:file_path) { File.expand_path("../../fixtures/blue-devil.png", __FILE__) }

        it "should extract technical metadata from the file" do
          result = client.run_tool(:metadata, file_path)
          expect(result.output.length).to_not eq(0)
        end
      end

    end
  end
end
metadata ADDED
@@ -0,0 +1,121 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ddr-filetools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - David Chandek-Stark
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: Pluggable text and metadata extraction service.
56
+ email:
57
+ - dchandekstark@gmail.com
58
+ executables:
59
+ - ".keep"
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - ".rspec"
65
+ - ".travis.yml"
66
+ - Gemfile
67
+ - LICENSE.txt
68
+ - README.md
69
+ - Rakefile
70
+ - bin/.keep
71
+ - ddr-filetools.gemspec
72
+ - lib/ddr-extraction.rb
73
+ - lib/ddr/filetools.rb
74
+ - lib/ddr/filetools/client.rb
75
+ - lib/ddr/filetools/command.rb
76
+ - lib/ddr/filetools/defaults.rb
77
+ - lib/ddr/filetools/fits.rb
78
+ - lib/ddr/filetools/metadata_command.rb
79
+ - lib/ddr/filetools/ocr_command.rb
80
+ - lib/ddr/filetools/provider.rb
81
+ - lib/ddr/filetools/result.rb
82
+ - lib/ddr/filetools/text_command.rb
83
+ - lib/ddr/filetools/tika.rb
84
+ - lib/ddr/filetools/tool.rb
85
+ - lib/ddr/filetools/version.rb
86
+ - lib/tasks/ddr_filetools.rake
87
+ - spec/fixtures/blue-devil.png
88
+ - spec/fixtures/sample.docx
89
+ - spec/fixtures/sample.pdf
90
+ - spec/spec_helper.rb
91
+ - spec/unit/client_spec.rb
92
+ homepage: https://github.com/duke-libraries/ddr-filetools
93
+ licenses:
94
+ - BSD-3-Clause
95
+ metadata: {}
96
+ post_install_message:
97
+ rdoc_options: []
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ requirements: []
111
+ rubyforge_project:
112
+ rubygems_version: 2.2.2
113
+ signing_key:
114
+ specification_version: 4
115
+ summary: Pluggable text and metadata extraction service.
116
+ test_files:
117
+ - spec/fixtures/blue-devil.png
118
+ - spec/fixtures/sample.docx
119
+ - spec/fixtures/sample.pdf
120
+ - spec/spec_helper.rb
121
+ - spec/unit/client_spec.rb