ddr-extraction 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c5403a9bee4dc10e433bca20f0a8ee11178a99dc
4
- data.tar.gz: d04210fc66bfaa0e8e368fb82f3236c7c2510d38
3
+ metadata.gz: 23d86fadee99408637cfc774cdd62efdaff04e7d
4
+ data.tar.gz: 03b9386486aa83632f5d96fcc65f209b7ad32952
5
5
  SHA512:
6
- metadata.gz: 7e300ee401d7cb26eb2cc259723b3a94009f9c2b4296dfa98870a401a67c4ba4c185fe8a8c5a85444c3db0d55c5a4916ef0d5d91cbdfa00356a904ef1cb2bdd0
7
- data.tar.gz: 6ad6b2c009439ea8b378df9cfc6accd45c3f169b7967bf8047b3864cc7ae91caa1eefc87779aaff6128e1b15b86e2dec7a8d69b4099d2ed278c747a8bce863d4
6
+ metadata.gz: afd68b70ea7ee45bed52c1debfabd88c4ebef23d40d997232c4452061f19ac64d0d2791fc90990807d456e468c660ed553538aaaddf59de731643126e3ab89bb
7
+ data.tar.gz: df8032a81665c2a4592df5a3ff96d3ff8f490dc1c8f16e4f1de482895fe119f48a5dc796e51d0b98aefea78080e1418d2c685a08ba73e94d7df2752322d07098
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
1
  Gemfile.lock
2
2
  bin/tika-*
3
3
  bin/fits-*
4
+ pkg
4
5
  tmp
data/README.md CHANGED
@@ -18,7 +18,65 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO
21
+ ```
22
+ >> extractor = Ddr::Extraction::Extractor.new
23
+ => #<Ddr::Extraction::Extractor:0x007fc2851dcfa0>
24
+
25
+ >> text = extractor.extract(:text, "spec/fixtures/sample.docx")
26
+ => #<IO:fd 11>
27
+
28
+ >> puts text.read
29
+ This is a sample document.
30
+
31
+ >> metadata = extractor.extract(:metadata, "spec/fixtures/blue-devil.png")
32
+ => #<IO:fd 12>
33
+
34
+ >> puts metadata.read
35
+ <?xml version="1.0" encoding="UTF-8"?>
36
+ <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.3" timestamp="11/12/14 12:36 PM">
37
+ <identification>
38
+ <identity format="Portable Network Graphics" mimetype="image/png" toolname="FITS" toolversion="0.8.3">
39
+ <tool toolname="Exiftool" toolversion="9.13" />
40
+ <tool toolname="Droid" toolversion="6.1.3" />
41
+ <tool toolname="ffident" toolversion="0.2" />
42
+ <tool toolname="Tika" toolversion="1.3" />
43
+ <version toolname="Droid" toolversion="6.1.3">1.0</version>
44
+ <externalIdentifier toolname="Droid" toolversion="6.1.3" type="puid">fmt/11</externalIdentifier>
45
+ </identity>
46
+ </identification>
47
+ <fileinfo>
48
+ <lastmodified toolname="Exiftool" toolversion="9.13" status="SINGLE_RESULT">2014:11:12 12:24:18-05:00</lastmodified>
49
+ <filepath toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">/path/to/spec/fixtures/blue-devil.png</filepath>
50
+ <filename toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">blue-devil.png</filename>
51
+ <size toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">75005</size>
52
+ <md5checksum toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">e6a5d16da2fbe65311952e2d8b04f069</md5checksum>
53
+ <fslastmodified toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">1415813058000</fslastmodified>
54
+ </fileinfo>
55
+ <filestatus />
56
+ <metadata>
57
+ <image>
58
+ <compressionScheme toolname="Exiftool" toolversion="9.13" status="CONFLICT">Deflate/Inflate</compressionScheme>
59
+ <compressionScheme toolname="Tika" toolversion="1.3" status="CONFLICT">Deflate</compressionScheme>
60
+ <imageWidth toolname="Exiftool" toolversion="9.13">200</imageWidth>
61
+ <imageHeight toolname="Exiftool" toolversion="9.13">200</imageHeight>
62
+ <orientation toolname="Tika" toolversion="1.3" status="SINGLE_RESULT">normal*</orientation>
63
+ </image>
64
+ </metadata>
65
+ <statistics fitsExecutionTime="791">
66
+ <tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
67
+ <tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
68
+ <tool toolname="Jhove" toolversion="1.5" executionTime="556" />
69
+ <tool toolname="file utility" toolversion="5.04" executionTime="623" />
70
+ <tool toolname="Exiftool" toolversion="9.13" executionTime="664" />
71
+ <tool toolname="Droid" toolversion="6.1.3" executionTime="147" />
72
+ <tool toolname="NLNZ Metadata Extractor" toolversion="3.4GA" executionTime="366" />
73
+ <tool toolname="OIS File Information" toolversion="0.2" executionTime="142" />
74
+ <tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
75
+ <tool toolname="ffident" toolversion="0.2" executionTime="369" />
76
+ <tool toolname="Tika" toolversion="1.3" executionTime="356" />
77
+ </statistics>
78
+ </fits>
79
+ ```
22
80
 
23
81
  ## Contributing
24
82
 
data/lib/ddr/extraction.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require "ddr/extraction/version"
2
- require "ddr/extraction/adapters"
2
+ require "ddr/extraction/configuration"
3
3
  require "ddr/extraction/extractor"
4
4
 
5
5
  module Ddr
@@ -10,38 +10,12 @@ module Ddr
10
10
 
11
11
  class << self
12
12
 
13
- attr_accessor :text_adapter, :metadata_adapter
13
+ attr_reader :config
14
14
 
15
- # Yields a configurable object for the named adapter.
16
- #
17
- # @param adapter [Symbol] the name of the adapter - e.g., `:tika`, `:fits`
18
- def configure_adapter(adapter, &block)
19
- yield Adapters.get_adapter(adapter)
20
- end
21
-
22
- def adapters
23
- Adapters.config
24
- end
25
-
26
- def set_defaults
27
- bin_dir = File.expand_path("../../../bin", __FILE__)
28
-
29
- configure_adapter :tika do |tika|
30
- tika.version = "1.6"
31
- tika.path = File.join(bin_dir, "tika-app.jar")
32
- tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{tika.version}.jar"
33
- tika.checksum = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
34
- tika.checksum_type = :SHA1
35
- end
36
-
37
- configure_adapter :fits do |fits|
38
- fits.version = "0.8.3"
39
- fits.path = File.join(bin_dir, "fits-#{fits.version}", "fits.sh")
40
- fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{fits.version}.zip"
41
- end
42
-
43
- adapters.text = :tika
44
- adapters.metadata = :fits
15
+ # Yields a configuration object for the service
16
+ def configure
17
+ @config ||= Configuration.new
18
+ yield @config
45
19
  end
46
20
 
47
21
  end
@@ -49,4 +23,3 @@ module Ddr
49
23
  end
50
24
  end
51
25
 
52
- Ddr::Extraction.set_defaults
data/lib/ddr/extraction/adapter.rb ADDED
@@ -0,0 +1,21 @@
1
+ require "delegate"
2
+ require_relative "adapters"
3
+
4
+ module Ddr
5
+ module Extraction
6
+ class Adapter < ::SimpleDelegator
7
+
8
+ class << self
9
+ # Accessors for adapter types
10
+ attr_accessor :text, :metadata
11
+
12
+ def build_adapter(type)
13
+ adapter_name = send(type)
14
+ adapter = Adapters.get_adapter(adapter_name)
15
+ new(adapter.new)
16
+ end
17
+ end
18
+
19
+ end
20
+ end
21
+ end
data/lib/ddr/extraction/adapters.rb CHANGED
@@ -8,18 +8,6 @@ module Ddr
8
8
  const_get(class_name.to_sym, false)
9
9
  end
10
10
 
11
- def self.build_adapter(type)
12
- adapter_name = config.send(type)
13
- adapter = get_adapter(adapter_name)
14
- adapter.new
15
- end
16
-
17
- Config = Struct.new(:text, :metadata)
18
-
19
- def self.config
20
- @@config ||= Config.new
21
- end
22
-
23
11
  end
24
12
  end
25
13
  end
data/lib/ddr/extraction/adapters/fits_adapter.rb CHANGED
@@ -1,14 +1,12 @@
1
- require_relative "adapter"
2
-
3
1
  module Ddr
4
2
  module Extraction
5
3
  module Adapters
6
- class FitsAdapter < Adapter
4
+ class FitsAdapter
7
5
 
8
6
  # Return metadata extracted from file
9
7
  #
10
8
  # @param file [String] the file from which to extract metadata.
11
- # @return [IO]
9
+ # @return [IO] the output
12
10
  def extract_metadata(file)
13
11
  IO.popen([self.class.path, "-i", file])
14
12
  end
data/lib/ddr/extraction/adapters/tika_adapter.rb CHANGED
@@ -1,14 +1,12 @@
1
- require_relative "adapter"
2
-
3
1
  module Ddr
4
2
  module Extraction
5
3
  module Adapters
6
- class TikaAdapter < Adapter
4
+ class TikaAdapter
7
5
 
8
6
  # Extract text from file
9
7
  #
10
8
  # @param file [String] path to file from which to extract text
11
- # @return [IO]
9
+ # @return [IO] the output
12
10
  def extract_text(file)
13
11
  IO.popen(["java", "-jar", self.class.path, "--text", file])
14
12
  end
@@ -26,8 +24,11 @@ module Ddr
26
24
  # URL to download distribution
27
25
  attr_accessor :download_url
28
26
 
27
+ # Verify checksum?
28
+ attr_accessor :verify_checksum
29
+
29
30
  # Tika distribution checksum
30
- attr_accessor :checksum
31
+ attr_accessor :checksum_value
31
32
 
32
33
  # Tika distribution checksum type
33
34
  attr_accessor :checksum_type
data/lib/ddr/extraction/configuration.rb ADDED
@@ -0,0 +1,22 @@
1
+ require_relative "adapter"
2
+ require_relative "adapters"
3
+
4
+ module Ddr
5
+ module Extraction
6
+ class Configuration
7
+
8
+ def adapters(name)
9
+ config = Adapters.get_adapter(name)
10
+ yield config if block_given?
11
+ config
12
+ end
13
+
14
+ def adapter
15
+ config = Adapter
16
+ yield config if block_given?
17
+ config
18
+ end
19
+
20
+ end
21
+ end
22
+ end
data/lib/ddr/extraction/defaults.rb ADDED
@@ -0,0 +1,26 @@
1
+ require "ddr-extraction"
2
+
3
+ BIN_DIR = File.expand_path("../../../../bin", __FILE__)
4
+ TIKA_VERSION = "1.6"
5
+ FITS_VERSION = "0.8.3"
6
+
7
+ Ddr::Extraction.configure do |config|
8
+ config.adapter.text = :tika
9
+ config.adapter.metadata = :fits
10
+
11
+ config.adapters(:tika) do |tika|
12
+ tika.version = TIKA_VERSION
13
+ tika.path = File.join(BIN_DIR, "tika-app.jar")
14
+ tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{TIKA_VERSION}.jar"
15
+ tika.verify_checksum = true
16
+ tika.checksum_value = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
17
+ tika.checksum_type = :SHA1
18
+ end
19
+
20
+ config.adapters(:fits) do |fits|
21
+ fits.version = "0.8.3"
22
+ fits.path = File.join(BIN_DIR, "fits-#{FITS_VERSION}", "fits.sh")
23
+ fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{FITS_VERSION}.zip"
24
+ end
25
+ end
26
+
data/lib/ddr/extraction/extractor.rb CHANGED
@@ -3,28 +3,20 @@ require_relative "adapters"
3
3
  module Ddr
4
4
  module Extraction
5
5
  class Extractor
6
- extend Forwardable
7
-
8
- def_delegator :text_adapter, :extract_text
9
- def_delegator :metadata_adapter, :extract_metadata
10
6
 
11
7
  # Extracts a type of content from a file
12
8
  #
13
9
  # @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
14
10
  # @param file [String] path to file from which to extract content.
15
- # @return
11
+ # @return [IO] the output
16
12
  def extract(type, file)
17
- send("extract_#{type}", file)
18
- end
13
+ adapter(type).send("extract_#{type}", file)
14
+ end
19
15
 
20
16
  private
21
17
 
22
- def text_adapter
23
- @text_adapter ||= Adapters.build_adapter(:text)
24
- end
25
-
26
- def metadata_adapter
27
- @metadata_adapter ||= Adapters.build_adapter(:metadata)
18
+ def adapter(type)
19
+ Adapter.build_adapter(type)
28
20
  end
29
21
 
30
22
  end
data/lib/ddr/extraction/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module Ddr
2
2
  module Extraction
3
- VERSION = "0.1.0"
3
+ VERSION = "0.2.0"
4
4
  end
5
5
  end
data/lib/tasks/ddr_extraction.rake CHANGED
@@ -1,4 +1,4 @@
1
- require "ddr-extraction"
1
+ require "ddr/extraction/defaults"
2
2
  require "openssl"
3
3
 
4
4
  DOWNLOAD_DIR = File.absolute_path("tmp")
@@ -7,18 +7,20 @@ namespace :tika do
7
7
  desc "Download Tika app"
8
8
  task :download => :download_dir do
9
9
  tika_app = File.join(DOWNLOAD_DIR, "tika-app.jar")
10
- Ddr::Extraction::Adapters::TikaAdapter.config do |tika|
11
- puts "Downloading Tika app ... "
12
- system "curl -L #{tika.download_url} -o #{tika_app}"
10
+ tika_config = Ddr::Extraction.config.adapters(:tika)
11
+ puts "Downloading Tika app ... "
12
+ system "curl -L #{tika_config.download_url} -o #{tika_app}"
13
+ if tika_config.verify_checksum
13
14
  puts "Verifiying checksum ... "
14
- digest = OpenSSL::Digest.const_get(tika.checksum_type).new
15
- digest << File.read(tika.path)
16
- if digest.to_s == tika.checksum
17
- FileUtils.mv(tika_app, tika.path)
18
- else
19
- puts "Checksums do not match!"
15
+ digest = OpenSSL::Digest.const_get(tika_config.checksum_type).new
16
+ digest << File.read(tika_config.path)
17
+ if digest.to_s != tika_config.checksum_value
18
+ puts "Checksums do not match -- aborting!"
19
+ FileUtils.remove_entry_secure(tika_app)
20
+ abort
20
21
  end
21
22
  end
23
+ FileUtils.mv(tika_app, tika_config.path)
22
24
  end
23
25
 
24
26
  # namespace :server do
@@ -32,13 +34,12 @@ namespace :fits do
32
34
  desc "Download FITS tool"
33
35
  task :download => :download_dir do
34
36
  fits_tool = File.join(DOWNLOAD_DIR, "fits.zip")
35
- Ddr::Extraction::Adapters::FitsAdapter.config do |fits|
36
- puts "Downloading FITS tool ... "
37
- system "curl -L #{fits.download_url} -o #{fits_tool}"
38
- # Unzip options: convert text files, force overwrite, extra quiet
39
- system "unzip -a -o -qq -d bin #{fits_tool}"
40
- FileUtils.chmod(0755, fits.path)
41
- end
37
+ fits_config = Ddr::Extraction.config.adapters(:fits)
38
+ puts "Downloading FITS tool ... "
39
+ system "curl -L #{fits_config.download_url} -o #{fits_tool}"
40
+ # Unzip options: convert text files, force overwrite, extra quiet
41
+ system "unzip -a -o -qq -d bin #{fits_tool}"
42
+ FileUtils.chmod(0755, fits_config.path)
42
43
  end
43
44
  end
44
45
 
data/spec/spec_helper.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "ddr-extraction"
1
+ require "ddr/extraction/defaults"
2
2
 
3
3
  # This file was generated by the `rspec --init` command. Conventionally, all
4
4
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ddr-extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Chandek-Stark
@@ -70,14 +70,13 @@ files:
70
70
  - ddr-extraction.gemspec
71
71
  - lib/ddr-extraction.rb
72
72
  - lib/ddr/extraction.rb
73
+ - lib/ddr/extraction/adapter.rb
73
74
  - lib/ddr/extraction/adapters.rb
74
- - lib/ddr/extraction/adapters/adapter.rb
75
75
  - lib/ddr/extraction/adapters/fits_adapter.rb
76
- - lib/ddr/extraction/adapters/metadata_extraction_adapter.rb
77
- - lib/ddr/extraction/adapters/text_extraction_adapter.rb
78
76
  - lib/ddr/extraction/adapters/tika_adapter.rb
77
+ - lib/ddr/extraction/configuration.rb
78
+ - lib/ddr/extraction/defaults.rb
79
79
  - lib/ddr/extraction/extractor.rb
80
- - lib/ddr/extraction/metadata_extractor.rb
81
80
  - lib/ddr/extraction/version.rb
82
81
  - lib/tasks/ddr_extraction.rake
83
82
  - spec/fixtures/blue-devil.png
data/lib/ddr/extraction/adapters/adapter.rb DELETED
@@ -1,15 +0,0 @@
1
- module Ddr
2
- module Extraction
3
- module Adapters
4
- class Adapter
5
-
6
- class << self
7
- def config
8
- yield self
9
- end
10
- end
11
-
12
- end
13
- end
14
- end
15
- end
data/lib/ddr/extraction/adapters/metadata_extraction_adapter.rb DELETED
@@ -1,13 +0,0 @@
1
- module Ddr
2
- module Extraction
3
- module Adapters
4
- class MetadataExtractionAdapter
5
-
6
- def extract_metadata(file)
7
- NotImplemented
8
- end
9
-
10
- end
11
- end
12
- end
13
- end
data/lib/ddr/extraction/adapters/text_extraction_adapter.rb DELETED
@@ -1,13 +0,0 @@
1
- module Ddr
2
- module Extraction
3
- module Adapters
4
- class TextExtractionAdapter
5
-
6
- def extract_text(file)
7
- NotImplemented
8
- end
9
-
10
- end
11
- end
12
- end
13
- end
data/lib/ddr/extraction/metadata_extractor.rb DELETED
@@ -1,16 +0,0 @@
1
- require "ddr/extraction/adapters"
2
-
3
- module Ddr
4
- module Extraction
5
- class MetadataExtractor
6
- extend Forwardable
7
-
8
- def_delegator :@adapter, :extract_metadata
9
-
10
- def initialize
11
- @adapter = Ddr::Extraction::Adapters.get_metadata_extraction_adapter
12
- end
13
-
14
- end
15
- end
16
- end