ddr-extraction 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1e6c32f2c82cc25e4a8c7b65b98becae648925ad
4
- data.tar.gz: 9f8f4da7e5e4c7db8fc66dcddda8bff671642572
3
+ metadata.gz: 2975444782fb458e450a8acb9ec54a690d457241
4
+ data.tar.gz: 5d2cedbf73284f32b7c5497d6a49d56aaaa1c0ee
5
5
  SHA512:
6
- metadata.gz: 8411bd09f0cb81d7cb16827fd3098f474c21fcf407da1ec38e2936fc3fc9982e892de08cf6ed747901fce92b6aef3d8cfa22c4a6f4bb2a4269fbb659adca539c
7
- data.tar.gz: 30967c3ecb2bd79fa5895e7198cb15d29b45be5eceb876ce2a60dd94371b4387296daac635b1fea7a0f38270dd59e62fdc11ed0da2a9be087a4067a2b64cffb9
6
+ metadata.gz: f72b6e7224081f193cbf4b8977167d84f995e467346d60590c48133746e74cd29622ee8d1d6fb3dbc426ec37339ec5e7985c08d5e3755268b1a46dac02bb6f07
7
+ data.tar.gz: ec401d6d43201348ac80bf5ce13dfcfd854918ed39cb49bdc990c86fd3b682ced86db425387e481f57e9f39bdbd6e7623ee88b44d8f5674052e5e8d07cabb8d5
data/README.md CHANGED
@@ -6,7 +6,7 @@ Pluggable file text and metadata extraction service.
6
6
 
7
7
  Add this line to your application's Gemfile:
8
8
 
9
- gem 'ddr-extractor'
9
+ gem 'ddr-extraction'
10
10
 
11
11
  And then execute:
12
12
 
@@ -14,68 +14,45 @@ And then execute:
14
14
 
15
15
  Or install it yourself as:
16
16
 
17
- $ gem install ddr-extractor
17
+ $ gem install ddr-extraction
18
18
 
19
- ## Usage
19
+ ## Dependencies
20
+
21
+ The gem has no external dependencies of its own. Consult the documentation for each extraction tool used by your configuration.
22
+
23
+ ## Configuration
24
+
25
+ `Ddr::Extraction` includes default configurations for [Apache Tika](http://tika.apache.org/) (text and metadata extraction) and [FITS](http://fitstool.org/) (metadata only). Tika is set as the default adapter when one is not specified to the builder.
20
26
 
27
+ ```ruby
28
+ require "ddr-extraction"
29
+ Ddr::Extraction.load_defaults!
21
30
  ```
22
- >> extractor = Ddr::Extraction::Extractor.new
23
- => #<Ddr::Extraction::Extractor:0x007fc2851dcfa0>
24
31
 
25
- >> text = extractor.extract(:text, "spec/fixtures/sample.docx")
26
- => #<IO:fd 11>
32
+ There are rake tasks for downloading Tika and FITS to expected locations.
33
+
34
+ ```sh
35
+ rake tika:download
36
+ rake fits:download
37
+ ```
38
+
39
+ Configuration Example
27
40
 
41
+ ```ruby
42
+ Ddr::Extraction.configure do |config|
43
+ config.adapters.default = :tika # Use Tika as the default adapter
44
+ config.adapters.tika.path = "/path/to/tika-app.jar"
45
+ config.adapters.fits.path = "/path/to/fits.sh"
46
+ end
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ```
52
+ >> extractor = Ddr::Extraction.build_extractor
53
+ >> text = extractor.extract(:text, "spec/fixtures/sample.docx")
28
54
  >> puts text.read
29
55
  This is a sample document.
30
-
31
- >> metadata = extractor.extract(:metadata, "spec/fixtures/blue-devil.png")
32
- => #<IO:fd 12>
33
-
34
- >> puts metadata.read
35
- <?xml version="1.0" encoding="UTF-8"?>
36
- <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.3" timestamp="11/12/14 12:36 PM">
37
- <identification>
38
- <identity format="Portable Network Graphics" mimetype="image/png" toolname="FITS" toolversion="0.8.3">
39
- <tool toolname="Exiftool" toolversion="9.13" />
40
- <tool toolname="Droid" toolversion="6.1.3" />
41
- <tool toolname="ffident" toolversion="0.2" />
42
- <tool toolname="Tika" toolversion="1.3" />
43
- <version toolname="Droid" toolversion="6.1.3">1.0</version>
44
- <externalIdentifier toolname="Droid" toolversion="6.1.3" type="puid">fmt/11</externalIdentifier>
45
- </identity>
46
- </identification>
47
- <fileinfo>
48
- <lastmodified toolname="Exiftool" toolversion="9.13" status="SINGLE_RESULT">2014:11:12 12:24:18-05:00</lastmodified>
49
- <filepath toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">/path/to/spec/fixtures/blue-devil.png</filepath>
50
- <filename toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">blue-devil.png</filename>
51
- <size toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">75005</size>
52
- <md5checksum toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">e6a5d16da2fbe65311952e2d8b04f069</md5checksum>
53
- <fslastmodified toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">1415813058000</fslastmodified>
54
- </fileinfo>
55
- <filestatus />
56
- <metadata>
57
- <image>
58
- <compressionScheme toolname="Exiftool" toolversion="9.13" status="CONFLICT">Deflate/Inflate</compressionScheme>
59
- <compressionScheme toolname="Tika" toolversion="1.3" status="CONFLICT">Deflate</compressionScheme>
60
- <imageWidth toolname="Exiftool" toolversion="9.13">200</imageWidth>
61
- <imageHeight toolname="Exiftool" toolversion="9.13">200</imageHeight>
62
- <orientation toolname="Tika" toolversion="1.3" status="SINGLE_RESULT">normal*</orientation>
63
- </image>
64
- </metadata>
65
- <statistics fitsExecutionTime="791">
66
- <tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
67
- <tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
68
- <tool toolname="Jhove" toolversion="1.5" executionTime="556" />
69
- <tool toolname="file utility" toolversion="5.04" executionTime="623" />
70
- <tool toolname="Exiftool" toolversion="9.13" executionTime="664" />
71
- <tool toolname="Droid" toolversion="6.1.3" executionTime="147" />
72
- <tool toolname="NLNZ Metadata Extractor" toolversion="3.4GA" executionTime="366" />
73
- <tool toolname="OIS File Information" toolversion="0.2" executionTime="142" />
74
- <tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
75
- <tool toolname="ffident" toolversion="0.2" executionTime="369" />
76
- <tool toolname="Tika" toolversion="1.3" executionTime="356" />
77
- </statistics>
78
- </fits>
79
56
  ```
80
57
 
81
58
  ## Contributing
@@ -1,26 +1,38 @@
1
1
  require_relative "extraction/version"
2
2
  require_relative "extraction/configuration"
3
3
  require_relative "extraction/extractor"
4
+ require_relative "extraction/adapters"
4
5
 
5
6
  module Ddr
6
7
  #
7
- # Ddr::Extraction - A file text and metadata extraction service.
8
+ # Ddr::Extraction - A pluggable content extraction service.
8
9
  #
9
10
  module Extraction
10
11
 
11
12
  class << self
12
13
 
14
+ # Returns the service configuration
13
15
  def config
14
16
  @config ||= Configuration.new
15
17
  end
16
18
 
17
- # Yields a configuration object for the service
19
+ # Yields the service configuration to a block
18
20
  def configure
19
21
  yield config
20
22
  end
21
-
23
+
24
+ # Loads default configuration settings
25
+ def load_defaults!
26
+ require_relative "extraction/defaults"
27
+ end
28
+
29
+ def build_extractor(adapter_name = nil)
30
+ Extractor.build(adapter_name)
31
+ end
32
+
22
33
  end
23
34
 
24
35
  end
25
36
  end
26
37
 
38
+ Dir[File.join(__dir__, "extraction", "adapters", "*_adapter.rb")].each { |adapter| require(adapter) }
@@ -1,25 +1,59 @@
1
+ require_relative "adapters/registry"
2
+
1
3
  module Ddr
2
4
  module Extraction
3
5
  module Adapters
4
6
 
5
- KNOWN_ADAPTERS = [:fits, :tika]
6
-
7
7
  class << self
8
+
9
+ # Accessor for the name of the default adapter
10
+ attr_accessor :default
11
+
12
+ # Return the requested adapter by name.
13
+ # If a name is not supplied, return the default adapter.
14
+ # @see .get_default_adapter
15
+ #
16
+ # @param adapter_name [Symbol] the name of the requested adapter.
17
+ # @return [Class] the adapter class requested.
18
+ def get_adapter(adapter_name = nil)
19
+ if adapter_name
20
+ Registry.instance.adapters[adapter_name.to_sym]
21
+ else
22
+ get_default_adapter
23
+ end
24
+ end
8
25
 
9
- def get_adapter(adapter_name)
10
- require_relative "adapters/#{adapter_name}_adapter"
11
- class_name = "#{adapter_name.to_s.capitalize}Adapter"
12
- const_get(class_name.to_sym, false)
26
+ # Return the default adapter.
27
+ # Raises an exception if the default adapter has not been configured.
28
+ def get_default_adapter
29
+ raise "The default adapter has not been configured." unless default
30
+ get_adapter(default)
13
31
  end
14
32
 
15
- KNOWN_ADAPTERS.each do |adapter|
16
- define_method(adapter) do
17
- get_adapter(adapter)
33
+ # Registers an adapter.
34
+ # @see Registry#register
35
+ #
36
+ # @param name [Symbol] the name of the adapter.
37
+ # @param adapter [Class] the adapter class to register.
38
+ def register(name, adapter)
39
+ Registry.instance.register(name, adapter)
40
+ end
41
+
42
+ # Creates methods to access each adapter.
43
+ Registry.instance.adapters.each do |name, adapter|
44
+ define_method(name) do
45
+ adapter
18
46
  end
19
47
  end
20
48
 
49
+ def method_missing(name, *args)
50
+ return get_adapter(name) if Registry.instance.adapters.key?(name.to_sym)
51
+ super
52
+ end
21
53
  end
22
54
 
23
55
  end
24
56
  end
25
57
  end
58
+
59
+ Dir[File.join(__dir__, "adapters", "*_adapter.rb")].each { |adapter| require(adapter) }
@@ -0,0 +1,55 @@
1
+ module Ddr
2
+ module Extraction
3
+ module Adapters
4
+ class Adapter
5
+
6
+ # Supported extraction output types
7
+ OUTPUT_TYPES = [:text, :metadata]
8
+
9
+ class << self
10
+ # Register the adapter
11
+ def register(adapter_name)
12
+ Ddr::Extraction::Adapters.register(adapter_name, self)
13
+ end
14
+ end
15
+
16
+ # Extract a kind of output from the file path
17
+ #
18
+ # @param output [Symbol] the kind of output, `:text` or `:metadata`
19
+ # @param file_path [String] path to the file to be processed
20
+ # @return [IO] the result of the extraction
21
+ # @api public
22
+ def extract(output, file_path)
23
+ raise ArgumentError, "Output type must be one of #{OUTPUT_TYPES}." unless OUTPUT_TYPES.include?(output)
24
+ raise IOError, "File not found: #{file_path}" unless File.exist?(file_path)
25
+ execute(command(output, file_path))
26
+ end
27
+
28
+ private
29
+
30
+ # Returns the command to be executed
31
+ #
32
+ # @param output [Symbol] the kind of output.
33
+ # @param file_path [String] path to the file to be processed.
34
+ # @return [String, Array] the command as a String or Array
35
+ # @see #extract
36
+ # @see #execute
37
+ # @api private
38
+ def command(output, file_path)
39
+ raise NotImplementedError, "The `command' instance method must be implemented by the adapter."
40
+ end
41
+
42
+ # Executes the command in a subprocess.
43
+ #
44
+ # @param cmd [String, Array] the command as a String or Array
45
+ # @see Ruby documentation for IO.popen
46
+ # @return [IO] the output of the command.
47
+ # @api private
48
+ def execute(cmd)
49
+ IO.popen(cmd)
50
+ end
51
+
52
+ end
53
+ end
54
+ end
55
+ end
@@ -1,22 +1,26 @@
1
+ require_relative "adapter"
2
+
1
3
  module Ddr
2
4
  module Extraction
3
5
  module Adapters
4
- class FitsAdapter
6
+ class FitsAdapter < Adapter
5
7
 
6
- # Return metadata extracted from file
7
- #
8
- # @param file [String] the file from which to extract metadata.
9
- # @return [IO] the output
10
- def extract_metadata(file)
11
- IO.popen([self.class.path, "-i", file])
12
- end
8
+ register :fits
13
9
 
14
10
  class << self
15
11
  # Path to FITS executable (fits.sh or fits.bat)
16
12
  attr_accessor :path
17
13
  end
18
14
 
15
+ private
16
+
17
+ def command(output, file_path)
18
+ raise "This adapter only supports :metadata output." unless output == :metadata
19
+ [self.class.path, "-i", file_path]
20
+ end
21
+
19
22
  end
20
23
  end
21
24
  end
22
25
  end
26
+
@@ -0,0 +1,21 @@
1
+ require_relative "adapter"
2
+
3
+ module Ddr
4
+ module Extraction
5
+ module Adapters
6
+ class NullAdapter < Adapter
7
+
8
+ register :null
9
+
10
+ private
11
+
12
+ def command(output, file_path)
13
+ "echo"
14
+ end
15
+
16
+ end
17
+
18
+ end
19
+ end
20
+ end
21
+
@@ -0,0 +1,42 @@
1
+ require "singleton"
2
+ require_relative "adapter"
3
+
4
+ module Ddr
5
+ module Extraction
6
+ module Adapters
7
+ #
8
+ # Registry of adapter names and classes
9
+ #
10
+ class Registry
11
+ include Singleton
12
+
13
+ attr_reader :adapters
14
+
15
+ def initialize
16
+ @adapters = {}
17
+ end
18
+
19
+ # Registers an adapter
20
+ #
21
+ # @param name [Symbol] the name of the adapter.
22
+ # @param adapter [Class] the adapter to be registered.
23
+ def register(name, adapter)
24
+ name = name.to_sym
25
+ validate!(name, adapter)
26
+ adapters[name] = adapter
27
+ end
28
+
29
+ private
30
+
31
+ def validate!(name, adapter)
32
+ raise "Another adapter is registered under the name :#{name}." if adapters.key?(name)
33
+ unless adapter < Adapter
34
+ raise ArgumentError, "Only subclasses of Ddr::Extraction::Adapters::Adapter may be registered."
35
+ end
36
+ raise "The adapter #{adapter.to_s} is already registered." if adapters.value?(adapter)
37
+ end
38
+ end
39
+
40
+ end
41
+ end
42
+ end
@@ -1,15 +1,11 @@
1
+ require_relative "adapter"
2
+
1
3
  module Ddr
2
4
  module Extraction
3
5
  module Adapters
4
- class TikaAdapter
5
-
6
- # Extract text from file
7
- #
8
- # @param file [String] path to file from which to extract text
9
- # @return [IO] the output
10
- def extract_text(file)
11
- IO.popen(["java", "-jar", self.class.path, "--text", file])
12
- end
6
+ class TikaAdapter < Adapter
7
+
8
+ register :tika
13
9
 
14
10
  class << self
15
11
  # Path to tika-app.jar
@@ -17,9 +13,25 @@ module Ddr
17
13
 
18
14
  # Tika server port (optional, required for server)
19
15
  attr_accessor :port
20
- end
16
+ end
17
+
18
+ private
19
+
20
+ def command(output, file_path)
21
+ ["java", "-jar", self.class.path, output_options(output), file_path].flatten
22
+ end
23
+
24
+ def output_options(output)
25
+ case output
26
+ when :text
27
+ "--text"
28
+ when :metadata
29
+ ["--metadata", "--xml"]
30
+ end
31
+ end
21
32
 
22
33
  end
34
+
23
35
  end
24
36
  end
25
37
  end
@@ -1,22 +1,16 @@
1
- require_relative "adapter"
2
1
  require_relative "adapters"
3
2
 
4
3
  module Ddr
5
4
  module Extraction
6
5
  class Configuration
7
6
 
7
+ # Returns an object having settable attributes for adapters.
8
8
  def adapters
9
9
  config = Adapters
10
10
  yield config if block_given?
11
11
  config
12
12
  end
13
13
 
14
- def adapter
15
- config = Adapter
16
- yield config if block_given?
17
- config
18
- end
19
-
20
14
  end
21
15
  end
22
16
  end
@@ -3,8 +3,7 @@ require "ddr-extraction"
3
3
  bin_dir = File.expand_path("../../../../bin", __FILE__)
4
4
 
5
5
  Ddr::Extraction.configure do |config|
6
- config.adapter.text = :tika
7
- config.adapter.metadata = :fits
6
+ config.adapters.default = :tika
8
7
  config.adapters.tika.path = File.join(bin_dir, "tika-app.jar")
9
8
  config.adapters.fits.path = File.join(bin_dir, "fits", "fits.sh")
10
9
  end
@@ -1,22 +1,36 @@
1
+ require "delegate"
1
2
  require_relative "adapters"
2
3
 
3
4
  module Ddr
4
5
  module Extraction
5
- class Extractor
6
+ #
7
+ # The Extractor is the main public class.
8
+ #
9
+ # It works by delegating to an adapter that does the real work.
10
+ #
11
+ # extractor = Ddr::Extraction::Extractor.build(:tika)
12
+ # text = extractor.extract(:text, "/path/to/text/file")
13
+ # puts text.read
14
+ # ...
15
+ #
16
+ class Extractor < ::SimpleDelegator
6
17
 
7
- # Extracts a type of content from a file
8
- #
9
- # @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
10
- # @param file [String] path to file from which to extract content.
11
- # @return [IO] the output
12
- def extract(type, file)
13
- adapter(type).send("extract_#{type}", file)
14
- end
18
+ class << self
15
19
 
16
- private
20
+ # Returns/yields an extractor instance
21
+ #
22
+ # @param adapter_name [Symbol] the name of the adapter to plug in.
23
+ # If not given, a default adapter will be used, if
24
+ # Ddr::Extraction::Adapters.default has been set with
25
+ # the name of the default adapter.
26
+ #
27
+ def build(adapter_name = nil)
28
+ adapter = Adapters.get_adapter(adapter_name)
29
+ extractor = new(adapter.new)
30
+ yield extractor if block_given?
31
+ extractor
32
+ end
17
33
 
18
- def adapter(type)
19
- Adapter.build_adapter(type)
20
34
  end
21
35
 
22
36
  end
@@ -1,5 +1,5 @@
1
1
  module Ddr
2
2
  module Extraction
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
@@ -3,16 +3,18 @@ module Ddr
3
3
  RSpec.describe Extractor do
4
4
 
5
5
  describe "extracting text" do
6
- let(:file) { File.expand_path("../../fixtures/sample.docx", __FILE__) }
6
+ subject { described_class.build(:tika) }
7
+ let(:file_path) { File.expand_path("../../fixtures/sample.docx", __FILE__) }
7
8
  it "should extract the text content of the file" do
8
- expect(subject.extract(:text, file).read).to match(/This is a sample document./)
9
+ expect(subject.extract(:text, file_path).read).to match(/This is a sample document./)
9
10
  end
10
11
  end
11
12
 
12
13
  describe "extracting metadata" do
13
- let(:file) { File.expand_path("../../fixtures/blue-devil.png", __FILE__) }
14
+ subject { described_class.build(:tika) }
15
+ let(:file_path) { File.expand_path("../../fixtures/blue-devil.png", __FILE__) }
14
16
  it "should extract technical metadata from the file" do
15
- expect(subject.extract(:metadata, file).read.length).to_not eq(0)
17
+ expect(subject.extract(:metadata, file_path).read.length).to_not eq(0)
16
18
  end
17
19
  end
18
20
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ddr-extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Chandek-Stark
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-13 00:00:00.000000000 Z
11
+ date: 2014-11-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -71,9 +71,11 @@ files:
71
71
  - ddr-extraction.gemspec
72
72
  - lib/ddr-extraction.rb
73
73
  - lib/ddr/extraction.rb
74
- - lib/ddr/extraction/adapter.rb
75
74
  - lib/ddr/extraction/adapters.rb
75
+ - lib/ddr/extraction/adapters/adapter.rb
76
76
  - lib/ddr/extraction/adapters/fits_adapter.rb
77
+ - lib/ddr/extraction/adapters/null_adapter.rb
78
+ - lib/ddr/extraction/adapters/registry.rb
77
79
  - lib/ddr/extraction/adapters/tika_adapter.rb
78
80
  - lib/ddr/extraction/configuration.rb
79
81
  - lib/ddr/extraction/defaults.rb
@@ -1,21 +0,0 @@
1
- require "delegate"
2
- require_relative "adapters"
3
-
4
- module Ddr
5
- module Extraction
6
- class Adapter < ::SimpleDelegator
7
-
8
- class << self
9
- # Accessors for adapter types
10
- attr_accessor :text, :metadata
11
-
12
- def build_adapter(type)
13
- adapter_name = send(type)
14
- adapter = Adapters.get_adapter(adapter_name)
15
- new(adapter.new)
16
- end
17
- end
18
-
19
- end
20
- end
21
- end