ddr-extraction 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1e6c32f2c82cc25e4a8c7b65b98becae648925ad
4
- data.tar.gz: 9f8f4da7e5e4c7db8fc66dcddda8bff671642572
3
+ metadata.gz: 2975444782fb458e450a8acb9ec54a690d457241
4
+ data.tar.gz: 5d2cedbf73284f32b7c5497d6a49d56aaaa1c0ee
5
5
  SHA512:
6
- metadata.gz: 8411bd09f0cb81d7cb16827fd3098f474c21fcf407da1ec38e2936fc3fc9982e892de08cf6ed747901fce92b6aef3d8cfa22c4a6f4bb2a4269fbb659adca539c
7
- data.tar.gz: 30967c3ecb2bd79fa5895e7198cb15d29b45be5eceb876ce2a60dd94371b4387296daac635b1fea7a0f38270dd59e62fdc11ed0da2a9be087a4067a2b64cffb9
6
+ metadata.gz: f72b6e7224081f193cbf4b8977167d84f995e467346d60590c48133746e74cd29622ee8d1d6fb3dbc426ec37339ec5e7985c08d5e3755268b1a46dac02bb6f07
7
+ data.tar.gz: ec401d6d43201348ac80bf5ce13dfcfd854918ed39cb49bdc990c86fd3b682ced86db425387e481f57e9f39bdbd6e7623ee88b44d8f5674052e5e8d07cabb8d5
data/README.md CHANGED
@@ -6,7 +6,7 @@ Pluggable file text and metadata extraction service.
6
6
 
7
7
  Add this line to your application's Gemfile:
8
8
 
9
- gem 'ddr-extractor'
9
+ gem 'ddr-extraction'
10
10
 
11
11
  And then execute:
12
12
 
@@ -14,68 +14,45 @@ And then execute:
14
14
 
15
15
  Or install it yourself as:
16
16
 
17
- $ gem install ddr-extractor
17
+ $ gem install ddr-extraction
18
18
 
19
- ## Usage
19
+ ## Dependencies
20
+
21
+ The gem has no external dependencies of its own. Consult the documentation for each extraction tool used by your configuration.
22
+
23
+ ## Configuration
24
+
25
+ `Ddr::Extraction` includes default configurations for [Apache Tika](http://tika.apache.org/) (text and metadata extraction) and [FITS](http://fitstool.org/) (metadata only). Tika is set as the default adapter when one is not specified to the builder.
20
26
 
27
+ ```ruby
28
+ require "ddr-extraction"
29
+ Ddr::Extraction.load_defaults!
21
30
  ```
22
- >> extractor = Ddr::Extraction::Extractor.new
23
- => #<Ddr::Extraction::Extractor:0x007fc2851dcfa0>
24
31
 
25
- >> text = extractor.extract(:text, "spec/fixtures/sample.docx")
26
- => #<IO:fd 11>
32
+ There are rake tasks for downloading Tika and FITS to expected locations.
33
+
34
+ ```sh
35
+ rake tika:download
36
+ rake fits:download
37
+ ```
38
+
39
+ Configuration Example
27
40
 
41
+ ```ruby
42
+ Ddr::Extraction.configure do |config|
43
+ config.adapters.default = :tika # Use Tika as the default adapter
44
+ config.adapters.tika.path = "/path/to/tika-app.jar"
45
+ config.adapters.fits.path = "/path/to/fits.sh"
46
+ end
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ```
52
+ >> extractor = Ddr::Extraction.build_extractor
53
+ >> text = extractor.extract(:text, "spec/fixtures/sample.docx")
28
54
  >> puts text.read
29
55
  This is a sample document.
30
-
31
- >> metadata = extractor.extract(:metadata, "spec/fixtures/blue-devil.png")
32
- => #<IO:fd 12>
33
-
34
- >> puts metadata.read
35
- <?xml version="1.0" encoding="UTF-8"?>
36
- <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.3" timestamp="11/12/14 12:36 PM">
37
- <identification>
38
- <identity format="Portable Network Graphics" mimetype="image/png" toolname="FITS" toolversion="0.8.3">
39
- <tool toolname="Exiftool" toolversion="9.13" />
40
- <tool toolname="Droid" toolversion="6.1.3" />
41
- <tool toolname="ffident" toolversion="0.2" />
42
- <tool toolname="Tika" toolversion="1.3" />
43
- <version toolname="Droid" toolversion="6.1.3">1.0</version>
44
- <externalIdentifier toolname="Droid" toolversion="6.1.3" type="puid">fmt/11</externalIdentifier>
45
- </identity>
46
- </identification>
47
- <fileinfo>
48
- <lastmodified toolname="Exiftool" toolversion="9.13" status="SINGLE_RESULT">2014:11:12 12:24:18-05:00</lastmodified>
49
- <filepath toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">/path/to/spec/fixtures/blue-devil.png</filepath>
50
- <filename toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">blue-devil.png</filename>
51
- <size toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">75005</size>
52
- <md5checksum toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">e6a5d16da2fbe65311952e2d8b04f069</md5checksum>
53
- <fslastmodified toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">1415813058000</fslastmodified>
54
- </fileinfo>
55
- <filestatus />
56
- <metadata>
57
- <image>
58
- <compressionScheme toolname="Exiftool" toolversion="9.13" status="CONFLICT">Deflate/Inflate</compressionScheme>
59
- <compressionScheme toolname="Tika" toolversion="1.3" status="CONFLICT">Deflate</compressionScheme>
60
- <imageWidth toolname="Exiftool" toolversion="9.13">200</imageWidth>
61
- <imageHeight toolname="Exiftool" toolversion="9.13">200</imageHeight>
62
- <orientation toolname="Tika" toolversion="1.3" status="SINGLE_RESULT">normal*</orientation>
63
- </image>
64
- </metadata>
65
- <statistics fitsExecutionTime="791">
66
- <tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
67
- <tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
68
- <tool toolname="Jhove" toolversion="1.5" executionTime="556" />
69
- <tool toolname="file utility" toolversion="5.04" executionTime="623" />
70
- <tool toolname="Exiftool" toolversion="9.13" executionTime="664" />
71
- <tool toolname="Droid" toolversion="6.1.3" executionTime="147" />
72
- <tool toolname="NLNZ Metadata Extractor" toolversion="3.4GA" executionTime="366" />
73
- <tool toolname="OIS File Information" toolversion="0.2" executionTime="142" />
74
- <tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
75
- <tool toolname="ffident" toolversion="0.2" executionTime="369" />
76
- <tool toolname="Tika" toolversion="1.3" executionTime="356" />
77
- </statistics>
78
- </fits>
79
56
  ```
80
57
 
81
58
  ## Contributing
@@ -1,26 +1,38 @@
1
1
  require_relative "extraction/version"
2
2
  require_relative "extraction/configuration"
3
3
  require_relative "extraction/extractor"
4
+ require_relative "extraction/adapters"
4
5
 
5
6
  module Ddr
6
7
  #
7
- # Ddr::Extraction - A file text and metadata extraction service.
8
+ # Ddr::Extraction - A pluggable content extraction service.
8
9
  #
9
10
  module Extraction
10
11
 
11
12
  class << self
12
13
 
14
+ # Returns the service configuration
13
15
  def config
14
16
  @config ||= Configuration.new
15
17
  end
16
18
 
17
- # Yields a configuration object for the service
19
+ # Yields the service configuration to a block
18
20
  def configure
19
21
  yield config
20
22
  end
21
-
23
+
24
+ # Loads default configuration settings
25
+ def load_defaults!
26
+ require_relative "extraction/defaults"
27
+ end
28
+
29
+ def build_extractor(adapter_name = nil)
30
+ Extractor.build(adapter_name)
31
+ end
32
+
22
33
  end
23
34
 
24
35
  end
25
36
  end
26
37
 
38
+ Dir[File.join(__dir__, "extraction", "adapters", "*_adapter.rb")].each { |adapter| require(adapter) }
@@ -1,25 +1,59 @@
1
+ require_relative "adapters/registry"
2
+
1
3
  module Ddr
2
4
  module Extraction
3
5
  module Adapters
4
6
 
5
- KNOWN_ADAPTERS = [:fits, :tika]
6
-
7
7
  class << self
8
+
9
+ # Accessor for the name of the default adapter
10
+ attr_accessor :default
11
+
12
+ # Return the requested adapter by name.
13
+ # If a name is not supplied, return the default adapter.
14
+ # @see .get_default_adapter
15
+ #
16
+ # @param adapter_name [Symbol] the name of the requested adapter.
17
+ # @return [Class] the adapter class requested.
18
+ def get_adapter(adapter_name = nil)
19
+ if adapter_name
20
+ Registry.instance.adapters[adapter_name.to_sym]
21
+ else
22
+ get_default_adapter
23
+ end
24
+ end
8
25
 
9
- def get_adapter(adapter_name)
10
- require_relative "adapters/#{adapter_name}_adapter"
11
- class_name = "#{adapter_name.to_s.capitalize}Adapter"
12
- const_get(class_name.to_sym, false)
26
+ # Return the default adapter.
27
+ # Raises an exception if the default adapter has not been configured.
28
+ def get_default_adapter
29
+ raise "The default adapter has not been configured." unless default
30
+ get_adapter(default)
13
31
  end
14
32
 
15
- KNOWN_ADAPTERS.each do |adapter|
16
- define_method(adapter) do
17
- get_adapter(adapter)
33
+ # Registers an adapter.
34
+ # @see Registry#register
35
+ #
36
+ # @param name [Symbol] the name of the adapter.
37
+ # @param adapter [Class] the adapter class to register.
38
+ def register(name, adapter)
39
+ Registry.instance.register(name, adapter)
40
+ end
41
+
42
+ # Creates methods to access each adapter.
43
+ Registry.instance.adapters.each do |name, adapter|
44
+ define_method(name) do
45
+ adapter
18
46
  end
19
47
  end
20
48
 
49
+ def method_missing(name, *args)
50
+ return get_adapter(name) if Registry.instance.adapters.key?(name.to_sym)
51
+ super
52
+ end
21
53
  end
22
54
 
23
55
  end
24
56
  end
25
57
  end
58
+
59
+ Dir[File.join(__dir__, "adapters", "*_adapter.rb")].each { |adapter| require(adapter) }
@@ -0,0 +1,55 @@
1
+ module Ddr
2
+ module Extraction
3
+ module Adapters
4
+ class Adapter
5
+
6
+ # Supported extraction output types
7
+ OUTPUT_TYPES = [:text, :metadata]
8
+
9
+ class << self
10
+ # Register the adapter
11
+ def register(adapter_name)
12
+ Ddr::Extraction::Adapters.register(adapter_name, self)
13
+ end
14
+ end
15
+
16
+ # Extract a kind of output from the file path
17
+ #
18
+ # @param output [Symbol] the kind of output, `:text` or `:metadata`
19
+ # @param file_path [String] path to the file to be processed
20
+ # @return [IO] the result of the extraction
21
+ # @api public
22
+ def extract(output, file_path)
23
+ raise ArgumentError, "Output type must be one of #{OUTPUT_TYPES}." unless OUTPUT_TYPES.include?(output)
24
+ raise IOError, "File not found: #{file_path}" unless File.exist?(file_path)
25
+ execute(command(output, file_path))
26
+ end
27
+
28
+ private
29
+
30
+ # Returns the command to be executed
31
+ #
32
+ # @param output [Symbol] the kind of output.
33
+ # @param file_path [String] path to the file to be processed.
34
+ # @return [String, Array] the command as a String or Array
35
+ # @see #extract
36
+ # @see #execute
37
+ # @api private
38
+ def command(output, file_path)
39
+ raise NotImplementedError, "The `command' instance method must be implemented by the adapter."
40
+ end
41
+
42
+ # Executes the command in a subprocess.
43
+ #
44
+ # @param cmd [String, Array] the command as a String or Array
45
+ # @see Ruby documentation for IO.popen
46
+ # @return [IO] the output of the command.
47
+ # @api private
48
+ def execute(cmd)
49
+ IO.popen(cmd)
50
+ end
51
+
52
+ end
53
+ end
54
+ end
55
+ end
@@ -1,22 +1,26 @@
1
+ require_relative "adapter"
2
+
1
3
  module Ddr
2
4
  module Extraction
3
5
  module Adapters
4
- class FitsAdapter
6
+ class FitsAdapter < Adapter
5
7
 
6
- # Return metadata extracted from file
7
- #
8
- # @param file [String] the file from which to extract metadata.
9
- # @return [IO] the output
10
- def extract_metadata(file)
11
- IO.popen([self.class.path, "-i", file])
12
- end
8
+ register :fits
13
9
 
14
10
  class << self
15
11
  # Path to FITS executable (fits.sh or fits.bat)
16
12
  attr_accessor :path
17
13
  end
18
14
 
15
+ private
16
+
17
+ def command(output, file_path)
18
+ raise "This adapter only supports :metadata output." unless output == :metadata
19
+ [self.class.path, "-i", file_path]
20
+ end
21
+
19
22
  end
20
23
  end
21
24
  end
22
25
  end
26
+
@@ -0,0 +1,21 @@
1
+ require_relative "adapter"
2
+
3
+ module Ddr
4
+ module Extraction
5
+ module Adapters
6
+ class NullAdapter < Adapter
7
+
8
+ register :null
9
+
10
+ private
11
+
12
+ def command(output, file_path)
13
+ "echo"
14
+ end
15
+
16
+ end
17
+
18
+ end
19
+ end
20
+ end
21
+
@@ -0,0 +1,42 @@
1
+ require "singleton"
2
+ require_relative "adapter"
3
+
4
+ module Ddr
5
+ module Extraction
6
+ module Adapters
7
+ #
8
+ # Registry of adapter names and classes
9
+ #
10
+ class Registry
11
+ include Singleton
12
+
13
+ attr_reader :adapters
14
+
15
+ def initialize
16
+ @adapters = {}
17
+ end
18
+
19
+ # Registers an adapter
20
+ #
21
+ # @param name [Symbol] the name of the adapter.
22
+ # @param adapter [Class] the adapter to be registered.
23
+ def register(name, adapter)
24
+ name = name.to_sym
25
+ validate!(name, adapter)
26
+ adapters[name] = adapter
27
+ end
28
+
29
+ private
30
+
31
+ def validate!(name, adapter)
32
+ raise "Another adapter is registered under the name :#{name}." if adapters.key?(name)
33
+ unless adapter < Adapter
34
+ raise ArgumentError, "Only subclasses of Ddr::Extraction::Adapters::Adapter may be registered."
35
+ end
36
+ raise "The adapter #{adapter.to_s} is already registered." if adapters.value?(adapter)
37
+ end
38
+ end
39
+
40
+ end
41
+ end
42
+ end
@@ -1,15 +1,11 @@
1
+ require_relative "adapter"
2
+
1
3
  module Ddr
2
4
  module Extraction
3
5
  module Adapters
4
- class TikaAdapter
5
-
6
- # Extract text from file
7
- #
8
- # @param file [String] path to file from which to extract text
9
- # @return [IO] the output
10
- def extract_text(file)
11
- IO.popen(["java", "-jar", self.class.path, "--text", file])
12
- end
6
+ class TikaAdapter < Adapter
7
+
8
+ register :tika
13
9
 
14
10
  class << self
15
11
  # Path to tika-app.jar
@@ -17,9 +13,25 @@ module Ddr
17
13
 
18
14
  # Tika server port (optional, required for server)
19
15
  attr_accessor :port
20
- end
16
+ end
17
+
18
+ private
19
+
20
+ def command(output, file_path)
21
+ ["java", "-jar", self.class.path, output_options(output), file_path].flatten
22
+ end
23
+
24
+ def output_options(output)
25
+ case output
26
+ when :text
27
+ "--text"
28
+ when :metadata
29
+ ["--metadata", "--xml"]
30
+ end
31
+ end
21
32
 
22
33
  end
34
+
23
35
  end
24
36
  end
25
37
  end
@@ -1,22 +1,16 @@
1
- require_relative "adapter"
2
1
  require_relative "adapters"
3
2
 
4
3
  module Ddr
5
4
  module Extraction
6
5
  class Configuration
7
6
 
7
+ # Returns an object that has settable attributes for adapters.
8
8
  def adapters
9
9
  config = Adapters
10
10
  yield config if block_given?
11
11
  config
12
12
  end
13
13
 
14
- def adapter
15
- config = Adapter
16
- yield config if block_given?
17
- config
18
- end
19
-
20
14
  end
21
15
  end
22
16
  end
@@ -3,8 +3,7 @@ require "ddr-extraction"
3
3
  bin_dir = File.expand_path("../../../../bin", __FILE__)
4
4
 
5
5
  Ddr::Extraction.configure do |config|
6
- config.adapter.text = :tika
7
- config.adapter.metadata = :fits
6
+ config.adapters.default = :tika
8
7
  config.adapters.tika.path = File.join(bin_dir, "tika-app.jar")
9
8
  config.adapters.fits.path = File.join(bin_dir, "fits", "fits.sh")
10
9
  end
@@ -1,22 +1,36 @@
1
+ require "delegate"
1
2
  require_relative "adapters"
2
3
 
3
4
  module Ddr
4
5
  module Extraction
5
- class Extractor
6
+ #
7
+ # The Extractor is the main public class.
8
+ #
9
+ # It works by delegating to an adapter that does the real work.
10
+ #
11
+ # extractor = Ddr::Extraction::Extractor.build(:tika)
12
+ # text = extractor.extract(:text, "/path/to/text/file")
13
+ # puts text.read
14
+ # ...
15
+ #
16
+ class Extractor < ::SimpleDelegator
6
17
 
7
- # Extracts a type of content from a file
8
- #
9
- # @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
10
- # @param file [String] path to file from which to extract content.
11
- # @return [IO] the output
12
- def extract(type, file)
13
- adapter(type).send("extract_#{type}", file)
14
- end
18
+ class << self
15
19
 
16
- private
20
+ # Returns/yields an extractor instance
21
+ #
22
+ # @param adapter_name [Symbol] the name of the adapter to plug in.
23
+ # If not given, a default adapter will be used, if
24
+ # Ddr::Extraction::Adapters.default has been set with
25
+ # the name of the default adapter.
26
+ #
27
+ def build(adapter_name = nil)
28
+ adapter = Adapters.get_adapter(adapter_name)
29
+ extractor = new(adapter.new)
30
+ yield extractor if block_given?
31
+ extractor
32
+ end
17
33
 
18
- def adapter(type)
19
- Adapter.build_adapter(type)
20
34
  end
21
35
 
22
36
  end
@@ -1,5 +1,5 @@
1
1
  module Ddr
2
2
  module Extraction
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
@@ -3,16 +3,18 @@ module Ddr
3
3
  RSpec.describe Extractor do
4
4
 
5
5
  describe "extracting text" do
6
- let(:file) { File.expand_path("../../fixtures/sample.docx", __FILE__) }
6
+ subject { described_class.build(:tika) }
7
+ let(:file_path) { File.expand_path("../../fixtures/sample.docx", __FILE__) }
7
8
  it "should extract the text content of the file" do
8
- expect(subject.extract(:text, file).read).to match(/This is a sample document./)
9
+ expect(subject.extract(:text, file_path).read).to match(/This is a sample document./)
9
10
  end
10
11
  end
11
12
 
12
13
  describe "extracting metadata" do
13
- let(:file) { File.expand_path("../../fixtures/blue-devil.png", __FILE__) }
14
+ subject { described_class.build(:tika) }
15
+ let(:file_path) { File.expand_path("../../fixtures/blue-devil.png", __FILE__) }
14
16
  it "should extract technical metadata from the file" do
15
- expect(subject.extract(:metadata, file).read.length).to_not eq(0)
17
+ expect(subject.extract(:metadata, file_path).read.length).to_not eq(0)
16
18
  end
17
19
  end
18
20
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ddr-extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Chandek-Stark
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-13 00:00:00.000000000 Z
11
+ date: 2014-11-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -71,9 +71,11 @@ files:
71
71
  - ddr-extraction.gemspec
72
72
  - lib/ddr-extraction.rb
73
73
  - lib/ddr/extraction.rb
74
- - lib/ddr/extraction/adapter.rb
75
74
  - lib/ddr/extraction/adapters.rb
75
+ - lib/ddr/extraction/adapters/adapter.rb
76
76
  - lib/ddr/extraction/adapters/fits_adapter.rb
77
+ - lib/ddr/extraction/adapters/null_adapter.rb
78
+ - lib/ddr/extraction/adapters/registry.rb
77
79
  - lib/ddr/extraction/adapters/tika_adapter.rb
78
80
  - lib/ddr/extraction/configuration.rb
79
81
  - lib/ddr/extraction/defaults.rb
@@ -1,21 +0,0 @@
1
- require "delegate"
2
- require_relative "adapters"
3
-
4
- module Ddr
5
- module Extraction
6
- class Adapter < ::SimpleDelegator
7
-
8
- class << self
9
- # Accessors for adapter types
10
- attr_accessor :text, :metadata
11
-
12
- def build_adapter(type)
13
- adapter_name = send(type)
14
- adapter = Adapters.get_adapter(adapter_name)
15
- new(adapter.new)
16
- end
17
- end
18
-
19
- end
20
- end
21
- end