ddr-extraction 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c5403a9bee4dc10e433bca20f0a8ee11178a99dc
4
+ data.tar.gz: d04210fc66bfaa0e8e368fb82f3236c7c2510d38
5
+ SHA512:
6
+ metadata.gz: 7e300ee401d7cb26eb2cc259723b3a94009f9c2b4296dfa98870a401a67c4ba4c185fe8a8c5a85444c3db0d55c5a4916ef0d5d91cbdfa00356a904ef1cb2bdd0
7
+ data.tar.gz: 6ad6b2c009439ea8b378df9cfc6accd45c3f169b7967bf8047b3864cc7ae91caa1eefc87779aaff6128e1b15b86e2dec7a8d69b4099d2ed278c747a8bce863d4
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ Gemfile.lock
2
+ bin/tika-*
3
+ bin/fits-*
4
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --warnings
3
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,12 @@
1
+ Copyright (c) 2014, Duke University
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5
+
6
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7
+
8
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9
+
10
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11
+
12
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Ddr::Extraction
2
+
3
+ Generic file text and metadata extraction service.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'ddr-extraction'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install ddr-extraction
18
+
19
+ ## Usage
20
+
21
+ TODO
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it ( https://github.com/duke-libraries/ddr-extraction/fork )
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ load "tasks/ddr_extraction.rake"
data/bin/.keep ADDED
File without changes
@@ -0,0 +1,24 @@
# coding: utf-8
#
# Gem specification for ddr-extraction. The version number is read from
# lib/ddr/extraction/version.rb, so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'ddr/extraction/version'

Gem::Specification.new do |spec|
  spec.name          = "ddr-extraction"
  spec.version       = Ddr::Extraction::VERSION
  spec.authors       = ["David Chandek-Stark"]
  spec.email         = ["dchandekstark@gmail.com"]
  spec.summary       = "File text and metadata extraction service."
  spec.description   = "File text and metadata extraction service."
  spec.homepage      = "https://github.com/duke-libraries/ddr-extraction"
  spec.license       = "BSD-3-Clause"

  # Package every file tracked by git.
  spec.files         = `git ls-files -z`.split("\x0")
  # Dotfiles under bin/ (the empty bin/.keep) are excluded: matching plain
  # `^bin/` would publish ".keep" as an executable of the gem.
  spec.executables   = spec.files.grep(%r{^bin/[^.]}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", "~> 3.0"
end
@@ -0,0 +1 @@
1
+ require "ddr/extraction"
@@ -0,0 +1,52 @@
require "ddr/extraction/version"
require "ddr/extraction/adapters"
require "ddr/extraction/extractor"

module Ddr
  #
  # Ddr::Extraction - A file text and metadata extraction service.
  #
  module Extraction
    class << self
      attr_accessor :text_adapter, :metadata_adapter

      # Yields the adapter class registered under the given name so that its
      # class-level settings can be assigned inside the block.
      #
      # @param adapter [Symbol] the name of the adapter - e.g., `:tika`, `:fits`
      # @yield the adapter class returned by Adapters.get_adapter
      def configure_adapter(adapter, &block)
        adapter_class = Adapters.get_adapter(adapter)
        yield adapter_class
      end

      # @return [Ddr::Extraction::Adapters::Config] the adapter selected for
      #   each extraction type
      def adapters
        Adapters.config
      end

      # Applies the built-in configuration: Tika for text, FITS for metadata,
      # both expected under the gem's own bin/ directory.
      def set_defaults
        gem_bin = File.expand_path("../../../bin", __FILE__)

        configure_adapter(:tika) do |t|
          # version is assigned first because download_url interpolates it
          t.version       = "1.6"
          t.path          = File.join(gem_bin, "tika-app.jar")
          t.download_url  = "http://archive.apache.org/dist/tika/tika-app-#{t.version}.jar"
          t.checksum      = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
          t.checksum_type = :SHA1
        end

        configure_adapter(:fits) do |f|
          # version is assigned first because path and download_url use it
          f.version      = "0.8.3"
          f.path         = File.join(gem_bin, "fits-#{f.version}", "fits.sh")
          f.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{f.version}.zip"
        end

        selection = adapters
        selection.text     = :tika
        selection.metadata = :fits
      end
    end
  end
end

# Install the default configuration as soon as the library is loaded.
Ddr::Extraction.set_defaults
@@ -0,0 +1,25 @@
module Ddr
  module Extraction
    # Locates, loads and instantiates extraction adapters, and records which
    # adapter is selected for each extraction type.
    module Adapters
      # Name of the adapter chosen for each extraction type.
      Config = Struct.new(:text, :metadata)

      # Loads the file that defines the named adapter and returns its class.
      #
      # @param adapter_name [Symbol, String] e.g. `:tika` loads
      #   `adapters/tika_adapter` and resolves `TikaAdapter`
      # @return [Class] the adapter class defined directly in this module
      def self.get_adapter(adapter_name)
        require_relative "adapters/#{adapter_name}_adapter"
        # `false` restricts the lookup to this module (no ancestor search).
        const_get("#{adapter_name.to_s.capitalize}Adapter", false)
      end

      # Instantiates the adapter currently configured for an extraction type.
      #
      # @param type [Symbol] a Config member, `:text` or `:metadata`
      # @return [Object] a new instance of the configured adapter class
      def self.build_adapter(type)
        # public_send (rather than send) so only Config's public readers
        # can be reached through the type argument.
        get_adapter(config.public_send(type)).new
      end

      # @return [Config] the lazily created, memoized adapter selection
      def self.config
        # Module-level instance variable instead of a @@class variable, so the
        # state belongs to this module alone.
        @config ||= Config.new
      end
    end
  end
end
@@ -0,0 +1,15 @@
module Ddr
  module Extraction
    module Adapters
      # Common superclass of the adapters.
      class Adapter
        # Yields the receiving class to the block, so that class-level
        # settings can be read or assigned there.
        #
        # @yield [Class] the class this method was called on
        # @return the value of the block
        def self.config
          yield self
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
require_relative "adapter"

module Ddr
  module Extraction
    module Adapters
      # Metadata adapter that runs the FITS executable on a file.
      class FitsAdapter < Adapter
        class << self
          # Class-level settings:
          #   version      - FITS version
          #   path         - path to FITS executable (fits.sh or fits.bat)
          #   download_url - URL to download distribution
          attr_accessor :version, :path, :download_url
        end

        # Runs `<path> -i <file>` as a subprocess and returns the pipe.
        #
        # @param file [String] the file from which to extract metadata.
        # @return [IO] pipe connected to the FITS subprocess
        def extract_metadata(file)
          fits_command = [self.class.path, "-i", file]
          IO.popen(fits_command)
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
module Ddr
  module Extraction
    module Adapters
      # Abstract interface for adapters that extract metadata from a file.
      class MetadataExtractionAdapter
        # Extracts metadata from a file. Must be overridden by subclasses.
        #
        # @param file [String] path to the file
        # @raise [NotImplementedError] always, in this abstract class
        def extract_metadata(file)
          # The bare constant `NotImplemented` is undefined in Ruby and would
          # surface as a NameError; raise the intended error explicitly.
          raise NotImplementedError, "#{self.class.name} must implement #extract_metadata"
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
module Ddr
  module Extraction
    module Adapters
      # Abstract interface for adapters that extract text from a file.
      class TextExtractionAdapter
        # Extracts text from a file. Must be overridden by subclasses.
        #
        # @param file [String] path to the file
        # @raise [NotImplementedError] always, in this abstract class
        def extract_text(file)
          # The bare constant `NotImplemented` is undefined in Ruby and would
          # surface as a NameError; raise the intended error explicitly.
          raise NotImplementedError, "#{self.class.name} must implement #extract_text"
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
require_relative "adapter"

module Ddr
  module Extraction
    module Adapters
      # Text adapter that runs the Tika application jar through `java`.
      class TikaAdapter < Adapter
        class << self
          # Class-level settings:
          #   version       - Tika version
          #   path          - path to tika-app.jar
          #   command       - base command
          #   download_url  - URL to download distribution
          #   checksum      - Tika distribution checksum
          #   checksum_type - Tika distribution checksum type
          #   port          - Tika server port (optional, required for server)
          attr_accessor :version, :path, :command, :download_url,
                        :checksum, :checksum_type, :port
        end

        # Runs `java -jar <path> --text <file>` as a subprocess and returns
        # the pipe.
        #
        # @param file [String] path to file from which to extract text
        # @return [IO] pipe connected to the Tika subprocess
        def extract_text(file)
          tika_command = ["java", "-jar", self.class.path, "--text", file]
          IO.popen(tika_command)
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
# Forwardable lives in the standard library but is not loaded by default.
require "forwardable"
require_relative "adapters"

module Ddr
  module Extraction
    # Extracts text or metadata from a file by delegating to the adapters
    # configured in Ddr::Extraction::Adapters.
    class Extractor
      extend Forwardable

      def_delegator :text_adapter, :extract_text
      def_delegator :metadata_adapter, :extract_metadata

      # Extracts a type of content from a file
      #
      # @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
      # @param file [String] path to file from which to extract content.
      # @return whatever the adapter's extract method returns (the Tika and
      #   FITS adapters in this gem document an IO)
      # @raise [NoMethodError] if there is no public `extract_<type>` method
      def extract(type, file)
        # public_send keeps the dynamic dispatch limited to public methods.
        public_send("extract_#{type}", file)
      end

      private

      # Adapter instances are built on first use and then reused.
      def text_adapter
        @text_adapter ||= Adapters.build_adapter(:text)
      end

      def metadata_adapter
        @metadata_adapter ||= Adapters.build_adapter(:metadata)
      end
    end
  end
end
@@ -0,0 +1,16 @@
# Forwardable lives in the standard library but is not loaded by default.
require "forwardable"
require "ddr/extraction/adapters"

module Ddr
  module Extraction
    # Extracts metadata from files using the configured metadata adapter.
    class MetadataExtractor
      extend Forwardable

      def_delegator :@adapter, :extract_metadata

      # Builds the adapter currently configured for `:metadata`.
      def initialize
        # Adapters defines no `get_metadata_extraction_adapter`; the method
        # that instantiates the configured adapter is `build_adapter`.
        @adapter = Ddr::Extraction::Adapters.build_adapter(:metadata)
      end
    end
  end
end
@@ -0,0 +1,5 @@
module Ddr
  module Extraction
    # Version of the ddr-extraction gem; read by the gemspec.
    VERSION = "0.1.0"
  end
end
@@ -0,0 +1,48 @@
require "ddr-extraction"
require "fileutils"
require "openssl"

# Scratch directory for downloads, resolved against the current working
# directory at load time.
DOWNLOAD_DIR = File.absolute_path("tmp")

namespace :tika do
  desc "Download Tika app"
  task :download => :download_dir do
    tika_app = File.join(DOWNLOAD_DIR, "tika-app.jar")
    Ddr::Extraction::Adapters::TikaAdapter.config do |tika|
      puts "Downloading Tika app ... "
      # Argument-list form of `system`: no shell is involved, so the URL and
      # paths are passed verbatim to curl.
      unless system("curl", "-L", tika.download_url, "-o", tika_app)
        abort "Download failed: #{tika.download_url}"
      end
      puts "Verifying checksum ... "
      digest = OpenSSL::Digest.const_get(tika.checksum_type).new
      # Hash the file that was just downloaded; tika.path is the install
      # destination and is only written after verification succeeds.
      digest << File.binread(tika_app)
      if digest.to_s == tika.checksum
        FileUtils.mv(tika_app, tika.path)
      else
        # Fail the task (non-zero exit) instead of only printing a message.
        abort "Checksums do not match!"
      end
    end
  end

  # namespace :server do
  #   desc "Start the Tika server"
  #   task :start => :environment do
  #   end
  # end
end

namespace :fits do
  desc "Download FITS tool"
  task :download => :download_dir do
    fits_tool = File.join(DOWNLOAD_DIR, "fits.zip")
    Ddr::Extraction::Adapters::FitsAdapter.config do |fits|
      puts "Downloading FITS tool ... "
      unless system("curl", "-L", fits.download_url, "-o", fits_tool)
        abort "Download failed: #{fits.download_url}"
      end
      # Unzip options: convert text files, force overwrite, extra quiet
      unless system("unzip", "-a", "-o", "-qq", "-d", "bin", fits_tool)
        abort "Could not unzip #{fits_tool}"
      end
      FileUtils.chmod(0755, fits.path)
    end
  end
end

# Creates the download directory if it is missing.
task :download_dir do
  # mkdir_p is a no-op when the directory already exists, which replaces the
  # Dir.exists? check (that method was removed in Ruby 3.2).
  FileUtils.mkdir_p(DOWNLOAD_DIR)
end
Binary file
Binary file
Binary file
@@ -0,0 +1,80 @@
require "ddr-extraction"

# RSpec configuration, originally generated by `rspec --init`. The `.rspec`
# file passes `--require spec_helper`, so this file is loaded on every run
# and should stay light-weight.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  # Run only examples tagged `:focus`; when nothing is tagged, run them all.
  config.filter_run :focus
  config.run_all_when_everything_filtered = true

  # Use the more verbose documentation formatter when a single spec file is
  # run, unless a formatter was already configured (e.g. on the command line).
  config.default_formatter = 'doc' if config.files_to_run.one?

  # Report the 10 slowest examples and example groups after each run.
  config.profile_examples = 10

  # Random ordering surfaces order dependencies; re-run with the printed
  # seed (`--seed 1234`) to reproduce a particular order.
  config.order = :random

  # Seed Ruby's global random number generator from `--seed` as well, so
  # randomization-related failures can be reproduced deterministically.
  Kernel.srand config.seed

  # rspec-expectations: allow only the non-monkey-patching `expect` syntax.
  # See http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
  config.expect_with :rspec do |expectations|
    expectations.syntax = :expect
  end

  # rspec-mocks: allow only the `expect` syntax.
  # See http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  config.mock_with :rspec do |mocks|
    mocks.syntax = :expect

    # Refuse to mock or stub a method that does not exist on the real object.
    mocks.verify_partial_doubles = true
  end
end
@@ -0,0 +1,21 @@
module Ddr
  module Extraction
    # Exercises Extractor#extract against the fixture files with the
    # configured adapters.
    RSpec.describe Extractor do
      describe "extracting text" do
        let(:file) { File.expand_path("../../fixtures/sample.docx", __FILE__) }

        it "should extract the text content of the file" do
          extracted = subject.extract(:text, file).read
          expect(extracted).to match(/This is a sample document./)
        end
      end

      describe "extracting metadata" do
        let(:file) { File.expand_path("../../fixtures/blue-devil.png", __FILE__) }

        it "should extract technical metadata from the file" do
          extracted = subject.extract(:metadata, file).read
          expect(extracted.length).to_not eq(0)
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ddr-extraction
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - David Chandek-Stark
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: File text and metadata extraction service.
56
+ email:
57
+ - dchandekstark@gmail.com
58
+ executables:
59
+ - ".keep"
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - ".rspec"
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - bin/.keep
70
+ - ddr-extraction.gemspec
71
+ - lib/ddr-extraction.rb
72
+ - lib/ddr/extraction.rb
73
+ - lib/ddr/extraction/adapters.rb
74
+ - lib/ddr/extraction/adapters/adapter.rb
75
+ - lib/ddr/extraction/adapters/fits_adapter.rb
76
+ - lib/ddr/extraction/adapters/metadata_extraction_adapter.rb
77
+ - lib/ddr/extraction/adapters/text_extraction_adapter.rb
78
+ - lib/ddr/extraction/adapters/tika_adapter.rb
79
+ - lib/ddr/extraction/extractor.rb
80
+ - lib/ddr/extraction/metadata_extractor.rb
81
+ - lib/ddr/extraction/version.rb
82
+ - lib/tasks/ddr_extraction.rake
83
+ - spec/fixtures/blue-devil.png
84
+ - spec/fixtures/sample.docx
85
+ - spec/fixtures/sample.pdf
86
+ - spec/spec_helper.rb
87
+ - spec/unit/extractor_spec.rb
88
+ homepage: https://github.com/duke-libraries/ddr-extraction
89
+ licenses:
90
+ - BSD-3-Clause
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.2.2
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: File text and metadata extraction service.
112
+ test_files:
113
+ - spec/fixtures/blue-devil.png
114
+ - spec/fixtures/sample.docx
115
+ - spec/fixtures/sample.pdf
116
+ - spec/spec_helper.rb
117
+ - spec/unit/extractor_spec.rb