ddr-extraction 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +59 -1
- data/lib/ddr/extraction.rb +6 -33
- data/lib/ddr/extraction/adapter.rb +21 -0
- data/lib/ddr/extraction/adapters.rb +0 -12
- data/lib/ddr/extraction/adapters/fits_adapter.rb +2 -4
- data/lib/ddr/extraction/adapters/tika_adapter.rb +6 -5
- data/lib/ddr/extraction/configuration.rb +22 -0
- data/lib/ddr/extraction/defaults.rb +26 -0
- data/lib/ddr/extraction/extractor.rb +5 -13
- data/lib/ddr/extraction/version.rb +1 -1
- data/lib/tasks/ddr_extraction.rake +18 -17
- data/spec/spec_helper.rb +1 -1
- metadata +4 -5
- data/lib/ddr/extraction/adapters/adapter.rb +0 -15
- data/lib/ddr/extraction/adapters/metadata_extraction_adapter.rb +0 -13
- data/lib/ddr/extraction/adapters/text_extraction_adapter.rb +0 -13
- data/lib/ddr/extraction/metadata_extractor.rb +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 23d86fadee99408637cfc774cdd62efdaff04e7d
|
4
|
+
data.tar.gz: 03b9386486aa83632f5d96fcc65f209b7ad32952
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afd68b70ea7ee45bed52c1debfabd88c4ebef23d40d997232c4452061f19ac64d0d2791fc90990807d456e468c660ed553538aaaddf59de731643126e3ab89bb
|
7
|
+
data.tar.gz: df8032a81665c2a4592df5a3ff96d3ff8f490dc1c8f16e4f1de482895fe119f48a5dc796e51d0b98aefea78080e1418d2c685a08ba73e94d7df2752322d07098
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -18,7 +18,65 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
```
|
22
|
+
>> extractor = Ddr::Extraction::Extractor.new
|
23
|
+
=> #<Ddr::Extraction::Extractor:0x007fc2851dcfa0>
|
24
|
+
|
25
|
+
>> text = extractor.extract(:text, "spec/fixtures/sample.docx")
|
26
|
+
=> #<IO:fd 11>
|
27
|
+
|
28
|
+
>> puts text.read
|
29
|
+
This is a sample document.
|
30
|
+
|
31
|
+
>> metadata = extractor.extract(:metadata, "spec/fixtures/blue-devil.png")
|
32
|
+
=> #<IO:fd 12>
|
33
|
+
|
34
|
+
>> puts metadata.read
|
35
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
36
|
+
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.3" timestamp="11/12/14 12:36 PM">
|
37
|
+
<identification>
|
38
|
+
<identity format="Portable Network Graphics" mimetype="image/png" toolname="FITS" toolversion="0.8.3">
|
39
|
+
<tool toolname="Exiftool" toolversion="9.13" />
|
40
|
+
<tool toolname="Droid" toolversion="6.1.3" />
|
41
|
+
<tool toolname="ffident" toolversion="0.2" />
|
42
|
+
<tool toolname="Tika" toolversion="1.3" />
|
43
|
+
<version toolname="Droid" toolversion="6.1.3">1.0</version>
|
44
|
+
<externalIdentifier toolname="Droid" toolversion="6.1.3" type="puid">fmt/11</externalIdentifier>
|
45
|
+
</identity>
|
46
|
+
</identification>
|
47
|
+
<fileinfo>
|
48
|
+
<lastmodified toolname="Exiftool" toolversion="9.13" status="SINGLE_RESULT">2014:11:12 12:24:18-05:00</lastmodified>
|
49
|
+
<filepath toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">/path/to/spec/fixtures/blue-devil.png</filepath>
|
50
|
+
<filename toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">blue-devil.png</filename>
|
51
|
+
<size toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">75005</size>
|
52
|
+
<md5checksum toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">e6a5d16da2fbe65311952e2d8b04f069</md5checksum>
|
53
|
+
<fslastmodified toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">1415813058000</fslastmodified>
|
54
|
+
</fileinfo>
|
55
|
+
<filestatus />
|
56
|
+
<metadata>
|
57
|
+
<image>
|
58
|
+
<compressionScheme toolname="Exiftool" toolversion="9.13" status="CONFLICT">Deflate/Inflate</compressionScheme>
|
59
|
+
<compressionScheme toolname="Tika" toolversion="1.3" status="CONFLICT">Deflate</compressionScheme>
|
60
|
+
<imageWidth toolname="Exiftool" toolversion="9.13">200</imageWidth>
|
61
|
+
<imageHeight toolname="Exiftool" toolversion="9.13">200</imageHeight>
|
62
|
+
<orientation toolname="Tika" toolversion="1.3" status="SINGLE_RESULT">normal*</orientation>
|
63
|
+
</image>
|
64
|
+
</metadata>
|
65
|
+
<statistics fitsExecutionTime="791">
|
66
|
+
<tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
|
67
|
+
<tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
|
68
|
+
<tool toolname="Jhove" toolversion="1.5" executionTime="556" />
|
69
|
+
<tool toolname="file utility" toolversion="5.04" executionTime="623" />
|
70
|
+
<tool toolname="Exiftool" toolversion="9.13" executionTime="664" />
|
71
|
+
<tool toolname="Droid" toolversion="6.1.3" executionTime="147" />
|
72
|
+
<tool toolname="NLNZ Metadata Extractor" toolversion="3.4GA" executionTime="366" />
|
73
|
+
<tool toolname="OIS File Information" toolversion="0.2" executionTime="142" />
|
74
|
+
<tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
|
75
|
+
<tool toolname="ffident" toolversion="0.2" executionTime="369" />
|
76
|
+
<tool toolname="Tika" toolversion="1.3" executionTime="356" />
|
77
|
+
</statistics>
|
78
|
+
</fits>
|
79
|
+
```
|
22
80
|
|
23
81
|
## Contributing
|
24
82
|
|
data/lib/ddr/extraction.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require "ddr/extraction/version"
|
2
|
-
require "ddr/extraction/
|
2
|
+
require "ddr/extraction/configuration"
|
3
3
|
require "ddr/extraction/extractor"
|
4
4
|
|
5
5
|
module Ddr
|
@@ -10,38 +10,12 @@ module Ddr
|
|
10
10
|
|
11
11
|
class << self
|
12
12
|
|
13
|
-
|
13
|
+
attr_reader :config
|
14
14
|
|
15
|
-
# Yields a
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
yield Adapters.get_adapter(adapter)
|
20
|
-
end
|
21
|
-
|
22
|
-
def adapters
|
23
|
-
Adapters.config
|
24
|
-
end
|
25
|
-
|
26
|
-
def set_defaults
|
27
|
-
bin_dir = File.expand_path("../../../bin", __FILE__)
|
28
|
-
|
29
|
-
configure_adapter :tika do |tika|
|
30
|
-
tika.version = "1.6"
|
31
|
-
tika.path = File.join(bin_dir, "tika-app.jar")
|
32
|
-
tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{tika.version}.jar"
|
33
|
-
tika.checksum = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
|
34
|
-
tika.checksum_type = :SHA1
|
35
|
-
end
|
36
|
-
|
37
|
-
configure_adapter :fits do |fits|
|
38
|
-
fits.version = "0.8.3"
|
39
|
-
fits.path = File.join(bin_dir, "fits-#{fits.version}", "fits.sh")
|
40
|
-
fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{fits.version}.zip"
|
41
|
-
end
|
42
|
-
|
43
|
-
adapters.text = :tika
|
44
|
-
adapters.metadata = :fits
|
15
|
+
# Yields a configuration object for the service
|
16
|
+
def configure
|
17
|
+
@config ||= Configuration.new
|
18
|
+
yield @config
|
45
19
|
end
|
46
20
|
|
47
21
|
end
|
@@ -49,4 +23,3 @@ module Ddr
|
|
49
23
|
end
|
50
24
|
end
|
51
25
|
|
52
|
-
Ddr::Extraction.set_defaults
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "delegate"
|
2
|
+
require_relative "adapters"
|
3
|
+
|
4
|
+
module Ddr
|
5
|
+
module Extraction
|
6
|
+
class Adapter < ::SimpleDelegator
|
7
|
+
|
8
|
+
class << self
|
9
|
+
# Accessors for adapter types
|
10
|
+
attr_accessor :text, :metadata
|
11
|
+
|
12
|
+
def build_adapter(type)
|
13
|
+
adapter_name = send(type)
|
14
|
+
adapter = Adapters.get_adapter(adapter_name)
|
15
|
+
new(adapter.new)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -8,18 +8,6 @@ module Ddr
|
|
8
8
|
const_get(class_name.to_sym, false)
|
9
9
|
end
|
10
10
|
|
11
|
-
def self.build_adapter(type)
|
12
|
-
adapter_name = config.send(type)
|
13
|
-
adapter = get_adapter(adapter_name)
|
14
|
-
adapter.new
|
15
|
-
end
|
16
|
-
|
17
|
-
Config = Struct.new(:text, :metadata)
|
18
|
-
|
19
|
-
def self.config
|
20
|
-
@@config ||= Config.new
|
21
|
-
end
|
22
|
-
|
23
11
|
end
|
24
12
|
end
|
25
13
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
require_relative "adapter"
|
2
|
-
|
3
1
|
module Ddr
|
4
2
|
module Extraction
|
5
3
|
module Adapters
|
6
|
-
class FitsAdapter
|
4
|
+
class FitsAdapter
|
7
5
|
|
8
6
|
# Return metadata extracted from file
|
9
7
|
#
|
10
8
|
# @param file [String] the file from which to extract metadata.
|
11
|
-
# @return [IO]
|
9
|
+
# @return [IO] the output
|
12
10
|
def extract_metadata(file)
|
13
11
|
IO.popen([self.class.path, "-i", file])
|
14
12
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
require_relative "adapter"
|
2
|
-
|
3
1
|
module Ddr
|
4
2
|
module Extraction
|
5
3
|
module Adapters
|
6
|
-
class TikaAdapter
|
4
|
+
class TikaAdapter
|
7
5
|
|
8
6
|
# Extract text from file
|
9
7
|
#
|
10
8
|
# @param file [String] path to file from which to extract text
|
11
|
-
# @return [IO]
|
9
|
+
# @return [IO] the output
|
12
10
|
def extract_text(file)
|
13
11
|
IO.popen(["java", "-jar", self.class.path, "--text", file])
|
14
12
|
end
|
@@ -26,8 +24,11 @@ module Ddr
|
|
26
24
|
# URL to download distribution
|
27
25
|
attr_accessor :download_url
|
28
26
|
|
27
|
+
# Verify checksum?
|
28
|
+
attr_accessor :verify_checksum
|
29
|
+
|
29
30
|
# Tika distribution checksum
|
30
|
-
attr_accessor :
|
31
|
+
attr_accessor :checksum_value
|
31
32
|
|
32
33
|
# Tika distribution checksum type
|
33
34
|
attr_accessor :checksum_type
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require_relative "adapter"
|
2
|
+
require_relative "adapters"
|
3
|
+
|
4
|
+
module Ddr
|
5
|
+
module Extraction
|
6
|
+
class Configuration
|
7
|
+
|
8
|
+
def adapters(name)
|
9
|
+
config = Adapters.get_adapter(name)
|
10
|
+
yield config if block_given?
|
11
|
+
config
|
12
|
+
end
|
13
|
+
|
14
|
+
def adapter
|
15
|
+
config = Adapter
|
16
|
+
yield config if block_given?
|
17
|
+
config
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "ddr-extraction"
|
2
|
+
|
3
|
+
BIN_DIR = File.expand_path("../../../../bin", __FILE__)
|
4
|
+
TIKA_VERSION = "1.6"
|
5
|
+
FITS_VERSION = "0.8.3"
|
6
|
+
|
7
|
+
Ddr::Extraction.configure do |config|
|
8
|
+
config.adapter.text = :tika
|
9
|
+
config.adapter.metadata = :fits
|
10
|
+
|
11
|
+
config.adapters(:tika) do |tika|
|
12
|
+
tika.version = TIKA_VERSION
|
13
|
+
tika.path = File.join(BIN_DIR, "tika-app.jar")
|
14
|
+
tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{TIKA_VERSION}.jar"
|
15
|
+
tika.verify_checksum = true
|
16
|
+
tika.checksum_value = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
|
17
|
+
tika.checksum_type = :SHA1
|
18
|
+
end
|
19
|
+
|
20
|
+
config.adapters(:fits) do |fits|
|
21
|
+
fits.version = "0.8.3"
|
22
|
+
fits.path = File.join(BIN_DIR, "fits-#{FITS_VERSION}", "fits.sh")
|
23
|
+
fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{FITS_VERSION}.zip"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
@@ -3,28 +3,20 @@ require_relative "adapters"
|
|
3
3
|
module Ddr
|
4
4
|
module Extraction
|
5
5
|
class Extractor
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
def_delegator :text_adapter, :extract_text
|
9
|
-
def_delegator :metadata_adapter, :extract_metadata
|
10
6
|
|
11
7
|
# Extracts a type of content from a file
|
12
8
|
#
|
13
9
|
# @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
|
14
10
|
# @param file [String] path to file from which to extract content.
|
15
|
-
# @return
|
11
|
+
# @return [IO] the output
|
16
12
|
def extract(type, file)
|
17
|
-
send("extract_#{type}", file)
|
18
|
-
end
|
13
|
+
adapter(type).send("extract_#{type}", file)
|
14
|
+
end
|
19
15
|
|
20
16
|
private
|
21
17
|
|
22
|
-
def
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def metadata_adapter
|
27
|
-
@metadata_adapter ||= Adapters.build_adapter(:metadata)
|
18
|
+
def adapter(type)
|
19
|
+
Adapter.build_adapter(type)
|
28
20
|
end
|
29
21
|
|
30
22
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require "ddr
|
1
|
+
require "ddr/extraction/defaults"
|
2
2
|
require "openssl"
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = File.absolute_path("tmp")
|
@@ -7,18 +7,20 @@ namespace :tika do
|
|
7
7
|
desc "Download Tika app"
|
8
8
|
task :download => :download_dir do
|
9
9
|
tika_app = File.join(DOWNLOAD_DIR, "tika-app.jar")
|
10
|
-
Ddr::Extraction
|
11
|
-
|
12
|
-
|
10
|
+
tika_config = Ddr::Extraction.config.adapters(:tika)
|
11
|
+
puts "Downloading Tika app ... "
|
12
|
+
system "curl -L #{tika_config.download_url} -o #{tika_app}"
|
13
|
+
if tika_config.verify_checksum
|
13
14
|
puts "Verifiying checksum ... "
|
14
|
-
digest = OpenSSL::Digest.const_get(
|
15
|
-
digest << File.read(
|
16
|
-
if digest.to_s
|
17
|
-
|
18
|
-
|
19
|
-
|
15
|
+
digest = OpenSSL::Digest.const_get(tika_config.checksum_type).new
|
16
|
+
digest << File.read(tika_config.path)
|
17
|
+
if digest.to_s != tika_config.checksum_value
|
18
|
+
puts "Checksums do not match -- aborting!"
|
19
|
+
FileUtils.remove_entry_secure(tika_app)
|
20
|
+
abort
|
20
21
|
end
|
21
22
|
end
|
23
|
+
FileUtils.mv(tika_app, tika_config.path)
|
22
24
|
end
|
23
25
|
|
24
26
|
# namespace :server do
|
@@ -32,13 +34,12 @@ namespace :fits do
|
|
32
34
|
desc "Download FITS tool"
|
33
35
|
task :download => :download_dir do
|
34
36
|
fits_tool = File.join(DOWNLOAD_DIR, "fits.zip")
|
35
|
-
Ddr::Extraction
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
37
|
+
fits_config = Ddr::Extraction.config.adapters(:fits)
|
38
|
+
puts "Downloading FITS tool ... "
|
39
|
+
system "curl -L #{fits_config.download_url} -o #{fits_tool}"
|
40
|
+
# Unzip options: convert text files, force overwrite, extra quiet
|
41
|
+
system "unzip -a -o -qq -d bin #{fits_tool}"
|
42
|
+
FileUtils.chmod(0755, fits_config.path)
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ddr-extraction
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Chandek-Stark
|
@@ -70,14 +70,13 @@ files:
|
|
70
70
|
- ddr-extraction.gemspec
|
71
71
|
- lib/ddr-extraction.rb
|
72
72
|
- lib/ddr/extraction.rb
|
73
|
+
- lib/ddr/extraction/adapter.rb
|
73
74
|
- lib/ddr/extraction/adapters.rb
|
74
|
-
- lib/ddr/extraction/adapters/adapter.rb
|
75
75
|
- lib/ddr/extraction/adapters/fits_adapter.rb
|
76
|
-
- lib/ddr/extraction/adapters/metadata_extraction_adapter.rb
|
77
|
-
- lib/ddr/extraction/adapters/text_extraction_adapter.rb
|
78
76
|
- lib/ddr/extraction/adapters/tika_adapter.rb
|
77
|
+
- lib/ddr/extraction/configuration.rb
|
78
|
+
- lib/ddr/extraction/defaults.rb
|
79
79
|
- lib/ddr/extraction/extractor.rb
|
80
|
-
- lib/ddr/extraction/metadata_extractor.rb
|
81
80
|
- lib/ddr/extraction/version.rb
|
82
81
|
- lib/tasks/ddr_extraction.rake
|
83
82
|
- spec/fixtures/blue-devil.png
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require "ddr/extraction/adapters"
|
2
|
-
|
3
|
-
module Ddr
|
4
|
-
module Extraction
|
5
|
-
class MetadataExtractor
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
def_delegator :@adapter, :extract_metadata
|
9
|
-
|
10
|
-
def initialize
|
11
|
-
@adapter = Ddr::Extraction::Adapters.get_metadata_extraction_adapter
|
12
|
-
end
|
13
|
-
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|