ddr-extraction 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c5403a9bee4dc10e433bca20f0a8ee11178a99dc
4
- data.tar.gz: d04210fc66bfaa0e8e368fb82f3236c7c2510d38
3
+ metadata.gz: 23d86fadee99408637cfc774cdd62efdaff04e7d
4
+ data.tar.gz: 03b9386486aa83632f5d96fcc65f209b7ad32952
5
5
  SHA512:
6
- metadata.gz: 7e300ee401d7cb26eb2cc259723b3a94009f9c2b4296dfa98870a401a67c4ba4c185fe8a8c5a85444c3db0d55c5a4916ef0d5d91cbdfa00356a904ef1cb2bdd0
7
- data.tar.gz: 6ad6b2c009439ea8b378df9cfc6accd45c3f169b7967bf8047b3864cc7ae91caa1eefc87779aaff6128e1b15b86e2dec7a8d69b4099d2ed278c747a8bce863d4
6
+ metadata.gz: afd68b70ea7ee45bed52c1debfabd88c4ebef23d40d997232c4452061f19ac64d0d2791fc90990807d456e468c660ed553538aaaddf59de731643126e3ab89bb
7
+ data.tar.gz: df8032a81665c2a4592df5a3ff96d3ff8f490dc1c8f16e4f1de482895fe119f48a5dc796e51d0b98aefea78080e1418d2c685a08ba73e94d7df2752322d07098
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
1
  Gemfile.lock
2
2
  bin/tika-*
3
3
  bin/fits-*
4
+ pkg
4
5
  tmp
data/README.md CHANGED
@@ -18,7 +18,65 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO
21
+ ```
22
+ >> extractor = Ddr::Extraction::Extractor.new
23
+ => #<Ddr::Extraction::Extractor:0x007fc2851dcfa0>
24
+
25
+ >> text = extractor.extract(:text, "spec/fixtures/sample.docx")
26
+ => #<IO:fd 11>
27
+
28
+ >> puts text.read
29
+ This is a sample document.
30
+
31
+ >> metadata = extractor.extract(:metadata, "spec/fixtures/blue-devil.png")
32
+ => #<IO:fd 12>
33
+
34
+ >> puts metadata.read
35
+ <?xml version="1.0" encoding="UTF-8"?>
36
+ <fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.3" timestamp="11/12/14 12:36 PM">
37
+ <identification>
38
+ <identity format="Portable Network Graphics" mimetype="image/png" toolname="FITS" toolversion="0.8.3">
39
+ <tool toolname="Exiftool" toolversion="9.13" />
40
+ <tool toolname="Droid" toolversion="6.1.3" />
41
+ <tool toolname="ffident" toolversion="0.2" />
42
+ <tool toolname="Tika" toolversion="1.3" />
43
+ <version toolname="Droid" toolversion="6.1.3">1.0</version>
44
+ <externalIdentifier toolname="Droid" toolversion="6.1.3" type="puid">fmt/11</externalIdentifier>
45
+ </identity>
46
+ </identification>
47
+ <fileinfo>
48
+ <lastmodified toolname="Exiftool" toolversion="9.13" status="SINGLE_RESULT">2014:11:12 12:24:18-05:00</lastmodified>
49
+ <filepath toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">/path/to/spec/fixtures/blue-devil.png</filepath>
50
+ <filename toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">blue-devil.png</filename>
51
+ <size toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">75005</size>
52
+ <md5checksum toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">e6a5d16da2fbe65311952e2d8b04f069</md5checksum>
53
+ <fslastmodified toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">1415813058000</fslastmodified>
54
+ </fileinfo>
55
+ <filestatus />
56
+ <metadata>
57
+ <image>
58
+ <compressionScheme toolname="Exiftool" toolversion="9.13" status="CONFLICT">Deflate/Inflate</compressionScheme>
59
+ <compressionScheme toolname="Tika" toolversion="1.3" status="CONFLICT">Deflate</compressionScheme>
60
+ <imageWidth toolname="Exiftool" toolversion="9.13">200</imageWidth>
61
+ <imageHeight toolname="Exiftool" toolversion="9.13">200</imageHeight>
62
+ <orientation toolname="Tika" toolversion="1.3" status="SINGLE_RESULT">normal*</orientation>
63
+ </image>
64
+ </metadata>
65
+ <statistics fitsExecutionTime="791">
66
+ <tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
67
+ <tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
68
+ <tool toolname="Jhove" toolversion="1.5" executionTime="556" />
69
+ <tool toolname="file utility" toolversion="5.04" executionTime="623" />
70
+ <tool toolname="Exiftool" toolversion="9.13" executionTime="664" />
71
+ <tool toolname="Droid" toolversion="6.1.3" executionTime="147" />
72
+ <tool toolname="NLNZ Metadata Extractor" toolversion="3.4GA" executionTime="366" />
73
+ <tool toolname="OIS File Information" toolversion="0.2" executionTime="142" />
74
+ <tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
75
+ <tool toolname="ffident" toolversion="0.2" executionTime="369" />
76
+ <tool toolname="Tika" toolversion="1.3" executionTime="356" />
77
+ </statistics>
78
+ </fits>
79
+ ```
22
80
 
23
81
  ## Contributing
24
82
 
@@ -1,5 +1,5 @@
1
1
  require "ddr/extraction/version"
2
- require "ddr/extraction/adapters"
2
+ require "ddr/extraction/configuration"
3
3
  require "ddr/extraction/extractor"
4
4
 
5
5
  module Ddr
@@ -10,38 +10,12 @@ module Ddr
10
10
 
11
11
  class << self
12
12
 
13
- attr_accessor :text_adapter, :metadata_adapter
13
+ attr_reader :config
14
14
 
15
- # Yields a configurable object for the named adapter.
16
- #
17
- # @param adapter [Symbol] the name of the adapter - e.g., `:tika`, `:fits`
18
- def configure_adapter(adapter, &block)
19
- yield Adapters.get_adapter(adapter)
20
- end
21
-
22
- def adapters
23
- Adapters.config
24
- end
25
-
26
- def set_defaults
27
- bin_dir = File.expand_path("../../../bin", __FILE__)
28
-
29
- configure_adapter :tika do |tika|
30
- tika.version = "1.6"
31
- tika.path = File.join(bin_dir, "tika-app.jar")
32
- tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{tika.version}.jar"
33
- tika.checksum = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
34
- tika.checksum_type = :SHA1
35
- end
36
-
37
- configure_adapter :fits do |fits|
38
- fits.version = "0.8.3"
39
- fits.path = File.join(bin_dir, "fits-#{fits.version}", "fits.sh")
40
- fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{fits.version}.zip"
41
- end
42
-
43
- adapters.text = :tika
44
- adapters.metadata = :fits
15
+ # Yields a configuration object for the service
16
+ def configure
17
+ @config ||= Configuration.new
18
+ yield @config
45
19
  end
46
20
 
47
21
  end
@@ -49,4 +23,3 @@ module Ddr
49
23
  end
50
24
  end
51
25
 
52
- Ddr::Extraction.set_defaults
@@ -0,0 +1,21 @@
1
+ require "delegate"
2
+ require_relative "adapters"
3
+
4
+ module Ddr
5
+ module Extraction
6
+ class Adapter < ::SimpleDelegator
7
+
8
+ class << self
9
+ # Accessors for adapter types
10
+ attr_accessor :text, :metadata
11
+
12
+ def build_adapter(type)
13
+ adapter_name = send(type)
14
+ adapter = Adapters.get_adapter(adapter_name)
15
+ new(adapter.new)
16
+ end
17
+ end
18
+
19
+ end
20
+ end
21
+ end
@@ -8,18 +8,6 @@ module Ddr
8
8
  const_get(class_name.to_sym, false)
9
9
  end
10
10
 
11
- def self.build_adapter(type)
12
- adapter_name = config.send(type)
13
- adapter = get_adapter(adapter_name)
14
- adapter.new
15
- end
16
-
17
- Config = Struct.new(:text, :metadata)
18
-
19
- def self.config
20
- @@config ||= Config.new
21
- end
22
-
23
11
  end
24
12
  end
25
13
  end
@@ -1,14 +1,12 @@
1
- require_relative "adapter"
2
-
3
1
  module Ddr
4
2
  module Extraction
5
3
  module Adapters
6
- class FitsAdapter < Adapter
4
+ class FitsAdapter
7
5
 
8
6
  # Return metadata extracted from file
9
7
  #
10
8
  # @param file [String] the file from which to extract metadata.
11
- # @return [IO]
9
+ # @return [IO] the output
12
10
  def extract_metadata(file)
13
11
  IO.popen([self.class.path, "-i", file])
14
12
  end
@@ -1,14 +1,12 @@
1
- require_relative "adapter"
2
-
3
1
  module Ddr
4
2
  module Extraction
5
3
  module Adapters
6
- class TikaAdapter < Adapter
4
+ class TikaAdapter
7
5
 
8
6
  # Extract text from file
9
7
  #
10
8
  # @param file [String] path to file from which to extract text
11
- # @return [IO]
9
+ # @return [IO] the output
12
10
  def extract_text(file)
13
11
  IO.popen(["java", "-jar", self.class.path, "--text", file])
14
12
  end
@@ -26,8 +24,11 @@ module Ddr
26
24
  # URL to download distribution
27
25
  attr_accessor :download_url
28
26
 
27
+ # Verify checksum?
28
+ attr_accessor :verify_checksum
29
+
29
30
  # Tika distribution checksum
30
- attr_accessor :checksum
31
+ attr_accessor :checksum_value
31
32
 
32
33
  # Tika distribution checksum type
33
34
  attr_accessor :checksum_type
@@ -0,0 +1,22 @@
1
+ require_relative "adapter"
2
+ require_relative "adapters"
3
+
4
+ module Ddr
5
+ module Extraction
6
+ class Configuration
7
+
8
+ def adapters(name)
9
+ config = Adapters.get_adapter(name)
10
+ yield config if block_given?
11
+ config
12
+ end
13
+
14
+ def adapter
15
+ config = Adapter
16
+ yield config if block_given?
17
+ config
18
+ end
19
+
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,26 @@
1
+ require "ddr-extraction"
2
+
3
+ BIN_DIR = File.expand_path("../../../../bin", __FILE__)
4
+ TIKA_VERSION = "1.6"
5
+ FITS_VERSION = "0.8.3"
6
+
7
+ Ddr::Extraction.configure do |config|
8
+ config.adapter.text = :tika
9
+ config.adapter.metadata = :fits
10
+
11
+ config.adapters(:tika) do |tika|
12
+ tika.version = TIKA_VERSION
13
+ tika.path = File.join(BIN_DIR, "tika-app.jar")
14
+ tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{TIKA_VERSION}.jar"
15
+ tika.verify_checksum = true
16
+ tika.checksum_value = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
17
+ tika.checksum_type = :SHA1
18
+ end
19
+
20
+ config.adapters(:fits) do |fits|
21
+ fits.version = "0.8.3"
22
+ fits.path = File.join(BIN_DIR, "fits-#{FITS_VERSION}", "fits.sh")
23
+ fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{FITS_VERSION}.zip"
24
+ end
25
+ end
26
+
@@ -3,28 +3,20 @@ require_relative "adapters"
3
3
  module Ddr
4
4
  module Extraction
5
5
  class Extractor
6
- extend Forwardable
7
-
8
- def_delegator :text_adapter, :extract_text
9
- def_delegator :metadata_adapter, :extract_metadata
10
6
 
11
7
  # Extracts a type of content from a file
12
8
  #
13
9
  # @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
14
10
  # @param file [String] path to file from which to extract content.
15
- # @return
11
+ # @return [IO] the output
16
12
  def extract(type, file)
17
- send("extract_#{type}", file)
18
- end
13
+ adapter(type).send("extract_#{type}", file)
14
+ end
19
15
 
20
16
  private
21
17
 
22
- def text_adapter
23
- @text_adapter ||= Adapters.build_adapter(:text)
24
- end
25
-
26
- def metadata_adapter
27
- @metadata_adapter ||= Adapters.build_adapter(:metadata)
18
+ def adapter(type)
19
+ Adapter.build_adapter(type)
28
20
  end
29
21
 
30
22
  end
@@ -1,5 +1,5 @@
1
1
  module Ddr
2
2
  module Extraction
3
- VERSION = "0.1.0"
3
+ VERSION = "0.2.0"
4
4
  end
5
5
  end
@@ -1,4 +1,4 @@
1
- require "ddr-extraction"
1
+ require "ddr/extraction/defaults"
2
2
  require "openssl"
3
3
 
4
4
  DOWNLOAD_DIR = File.absolute_path("tmp")
@@ -7,18 +7,20 @@ namespace :tika do
7
7
  desc "Download Tika app"
8
8
  task :download => :download_dir do
9
9
  tika_app = File.join(DOWNLOAD_DIR, "tika-app.jar")
10
- Ddr::Extraction::Adapters::TikaAdapter.config do |tika|
11
- puts "Downloading Tika app ... "
12
- system "curl -L #{tika.download_url} -o #{tika_app}"
10
+ tika_config = Ddr::Extraction.config.adapters(:tika)
11
+ puts "Downloading Tika app ... "
12
+ system "curl -L #{tika_config.download_url} -o #{tika_app}"
13
+ if tika_config.verify_checksum
13
14
  puts "Verifiying checksum ... "
14
- digest = OpenSSL::Digest.const_get(tika.checksum_type).new
15
- digest << File.read(tika.path)
16
- if digest.to_s == tika.checksum
17
- FileUtils.mv(tika_app, tika.path)
18
- else
19
- puts "Checksums do not match!"
15
+ digest = OpenSSL::Digest.const_get(tika_config.checksum_type).new
16
+ digest << File.read(tika_config.path)
17
+ if digest.to_s != tika_config.checksum_value
18
+ puts "Checksums do not match -- aborting!"
19
+ FileUtils.remove_entry_secure(tika_app)
20
+ abort
20
21
  end
21
22
  end
23
+ FileUtils.mv(tika_app, tika_config.path)
22
24
  end
23
25
 
24
26
  # namespace :server do
@@ -32,13 +34,12 @@ namespace :fits do
32
34
  desc "Download FITS tool"
33
35
  task :download => :download_dir do
34
36
  fits_tool = File.join(DOWNLOAD_DIR, "fits.zip")
35
- Ddr::Extraction::Adapters::FitsAdapter.config do |fits|
36
- puts "Downloading FITS tool ... "
37
- system "curl -L #{fits.download_url} -o #{fits_tool}"
38
- # Unzip options: convert text files, force overwrite, extra quiet
39
- system "unzip -a -o -qq -d bin #{fits_tool}"
40
- FileUtils.chmod(0755, fits.path)
41
- end
37
+ fits_config = Ddr::Extraction.config.adapters(:fits)
38
+ puts "Downloading FITS tool ... "
39
+ system "curl -L #{fits_config.download_url} -o #{fits_tool}"
40
+ # Unzip options: convert text files, force overwrite, extra quiet
41
+ system "unzip -a -o -qq -d bin #{fits_tool}"
42
+ FileUtils.chmod(0755, fits_config.path)
42
43
  end
43
44
  end
44
45
 
data/spec/spec_helper.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "ddr-extraction"
1
+ require "ddr/extraction/defaults"
2
2
 
3
3
  # This file was generated by the `rspec --init` command. Conventionally, all
4
4
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ddr-extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Chandek-Stark
@@ -70,14 +70,13 @@ files:
70
70
  - ddr-extraction.gemspec
71
71
  - lib/ddr-extraction.rb
72
72
  - lib/ddr/extraction.rb
73
+ - lib/ddr/extraction/adapter.rb
73
74
  - lib/ddr/extraction/adapters.rb
74
- - lib/ddr/extraction/adapters/adapter.rb
75
75
  - lib/ddr/extraction/adapters/fits_adapter.rb
76
- - lib/ddr/extraction/adapters/metadata_extraction_adapter.rb
77
- - lib/ddr/extraction/adapters/text_extraction_adapter.rb
78
76
  - lib/ddr/extraction/adapters/tika_adapter.rb
77
+ - lib/ddr/extraction/configuration.rb
78
+ - lib/ddr/extraction/defaults.rb
79
79
  - lib/ddr/extraction/extractor.rb
80
- - lib/ddr/extraction/metadata_extractor.rb
81
80
  - lib/ddr/extraction/version.rb
82
81
  - lib/tasks/ddr_extraction.rake
83
82
  - spec/fixtures/blue-devil.png
@@ -1,15 +0,0 @@
1
- module Ddr
2
- module Extraction
3
- module Adapters
4
- class Adapter
5
-
6
- class << self
7
- def config
8
- yield self
9
- end
10
- end
11
-
12
- end
13
- end
14
- end
15
- end
@@ -1,13 +0,0 @@
1
- module Ddr
2
- module Extraction
3
- module Adapters
4
- class MetadataExtractionAdapter
5
-
6
- def extract_metadata(file)
7
- NotImplemented
8
- end
9
-
10
- end
11
- end
12
- end
13
- end
@@ -1,13 +0,0 @@
1
- module Ddr
2
- module Extraction
3
- module Adapters
4
- class TextExtractionAdapter
5
-
6
- def extract_text(file)
7
- NotImplemented
8
- end
9
-
10
- end
11
- end
12
- end
13
- end
@@ -1,16 +0,0 @@
1
- require "ddr/extraction/adapters"
2
-
3
- module Ddr
4
- module Extraction
5
- class MetadataExtractor
6
- extend Forwardable
7
-
8
- def_delegator :@adapter, :extract_metadata
9
-
10
- def initialize
11
- @adapter = Ddr::Extraction::Adapters.get_metadata_extraction_adapter
12
- end
13
-
14
- end
15
- end
16
- end