ddr-extraction 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +59 -1
- data/lib/ddr/extraction.rb +6 -33
- data/lib/ddr/extraction/adapter.rb +21 -0
- data/lib/ddr/extraction/adapters.rb +0 -12
- data/lib/ddr/extraction/adapters/fits_adapter.rb +2 -4
- data/lib/ddr/extraction/adapters/tika_adapter.rb +6 -5
- data/lib/ddr/extraction/configuration.rb +22 -0
- data/lib/ddr/extraction/defaults.rb +26 -0
- data/lib/ddr/extraction/extractor.rb +5 -13
- data/lib/ddr/extraction/version.rb +1 -1
- data/lib/tasks/ddr_extraction.rake +18 -17
- data/spec/spec_helper.rb +1 -1
- metadata +4 -5
- data/lib/ddr/extraction/adapters/adapter.rb +0 -15
- data/lib/ddr/extraction/adapters/metadata_extraction_adapter.rb +0 -13
- data/lib/ddr/extraction/adapters/text_extraction_adapter.rb +0 -13
- data/lib/ddr/extraction/metadata_extractor.rb +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 23d86fadee99408637cfc774cdd62efdaff04e7d
|
4
|
+
data.tar.gz: 03b9386486aa83632f5d96fcc65f209b7ad32952
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afd68b70ea7ee45bed52c1debfabd88c4ebef23d40d997232c4452061f19ac64d0d2791fc90990807d456e468c660ed553538aaaddf59de731643126e3ab89bb
|
7
|
+
data.tar.gz: df8032a81665c2a4592df5a3ff96d3ff8f490dc1c8f16e4f1de482895fe119f48a5dc796e51d0b98aefea78080e1418d2c685a08ba73e94d7df2752322d07098
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -18,7 +18,65 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
```
|
22
|
+
>> extractor = Ddr::Extraction::Extractor.new
|
23
|
+
=> #<Ddr::Extraction::Extractor:0x007fc2851dcfa0>
|
24
|
+
|
25
|
+
>> text = extractor.extract(:text, "spec/fixtures/sample.docx")
|
26
|
+
=> #<IO:fd 11>
|
27
|
+
|
28
|
+
>> puts text.read
|
29
|
+
This is a sample document.
|
30
|
+
|
31
|
+
>> metadata = extractor.extract(:metadata, "spec/fixtures/blue-devil.png")
|
32
|
+
=> #<IO:fd 12>
|
33
|
+
|
34
|
+
>> puts metadata.read
|
35
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
36
|
+
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="0.8.3" timestamp="11/12/14 12:36 PM">
|
37
|
+
<identification>
|
38
|
+
<identity format="Portable Network Graphics" mimetype="image/png" toolname="FITS" toolversion="0.8.3">
|
39
|
+
<tool toolname="Exiftool" toolversion="9.13" />
|
40
|
+
<tool toolname="Droid" toolversion="6.1.3" />
|
41
|
+
<tool toolname="ffident" toolversion="0.2" />
|
42
|
+
<tool toolname="Tika" toolversion="1.3" />
|
43
|
+
<version toolname="Droid" toolversion="6.1.3">1.0</version>
|
44
|
+
<externalIdentifier toolname="Droid" toolversion="6.1.3" type="puid">fmt/11</externalIdentifier>
|
45
|
+
</identity>
|
46
|
+
</identification>
|
47
|
+
<fileinfo>
|
48
|
+
<lastmodified toolname="Exiftool" toolversion="9.13" status="SINGLE_RESULT">2014:11:12 12:24:18-05:00</lastmodified>
|
49
|
+
<filepath toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">/path/to/spec/fixtures/blue-devil.png</filepath>
|
50
|
+
<filename toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">blue-devil.png</filename>
|
51
|
+
<size toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">75005</size>
|
52
|
+
<md5checksum toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">e6a5d16da2fbe65311952e2d8b04f069</md5checksum>
|
53
|
+
<fslastmodified toolname="OIS File Information" toolversion="0.2" status="SINGLE_RESULT">1415813058000</fslastmodified>
|
54
|
+
</fileinfo>
|
55
|
+
<filestatus />
|
56
|
+
<metadata>
|
57
|
+
<image>
|
58
|
+
<compressionScheme toolname="Exiftool" toolversion="9.13" status="CONFLICT">Deflate/Inflate</compressionScheme>
|
59
|
+
<compressionScheme toolname="Tika" toolversion="1.3" status="CONFLICT">Deflate</compressionScheme>
|
60
|
+
<imageWidth toolname="Exiftool" toolversion="9.13">200</imageWidth>
|
61
|
+
<imageHeight toolname="Exiftool" toolversion="9.13">200</imageHeight>
|
62
|
+
<orientation toolname="Tika" toolversion="1.3" status="SINGLE_RESULT">normal*</orientation>
|
63
|
+
</image>
|
64
|
+
</metadata>
|
65
|
+
<statistics fitsExecutionTime="791">
|
66
|
+
<tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
|
67
|
+
<tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
|
68
|
+
<tool toolname="Jhove" toolversion="1.5" executionTime="556" />
|
69
|
+
<tool toolname="file utility" toolversion="5.04" executionTime="623" />
|
70
|
+
<tool toolname="Exiftool" toolversion="9.13" executionTime="664" />
|
71
|
+
<tool toolname="Droid" toolversion="6.1.3" executionTime="147" />
|
72
|
+
<tool toolname="NLNZ Metadata Extractor" toolversion="3.4GA" executionTime="366" />
|
73
|
+
<tool toolname="OIS File Information" toolversion="0.2" executionTime="142" />
|
74
|
+
<tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
|
75
|
+
<tool toolname="ffident" toolversion="0.2" executionTime="369" />
|
76
|
+
<tool toolname="Tika" toolversion="1.3" executionTime="356" />
|
77
|
+
</statistics>
|
78
|
+
</fits>
|
79
|
+
```
|
22
80
|
|
23
81
|
## Contributing
|
24
82
|
|
data/lib/ddr/extraction.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require "ddr/extraction/version"
|
2
|
-
require "ddr/extraction/
|
2
|
+
require "ddr/extraction/configuration"
|
3
3
|
require "ddr/extraction/extractor"
|
4
4
|
|
5
5
|
module Ddr
|
@@ -10,38 +10,12 @@ module Ddr
|
|
10
10
|
|
11
11
|
class << self
|
12
12
|
|
13
|
-
|
13
|
+
attr_reader :config
|
14
14
|
|
15
|
-
# Yields a
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
yield Adapters.get_adapter(adapter)
|
20
|
-
end
|
21
|
-
|
22
|
-
def adapters
|
23
|
-
Adapters.config
|
24
|
-
end
|
25
|
-
|
26
|
-
def set_defaults
|
27
|
-
bin_dir = File.expand_path("../../../bin", __FILE__)
|
28
|
-
|
29
|
-
configure_adapter :tika do |tika|
|
30
|
-
tika.version = "1.6"
|
31
|
-
tika.path = File.join(bin_dir, "tika-app.jar")
|
32
|
-
tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{tika.version}.jar"
|
33
|
-
tika.checksum = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
|
34
|
-
tika.checksum_type = :SHA1
|
35
|
-
end
|
36
|
-
|
37
|
-
configure_adapter :fits do |fits|
|
38
|
-
fits.version = "0.8.3"
|
39
|
-
fits.path = File.join(bin_dir, "fits-#{fits.version}", "fits.sh")
|
40
|
-
fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{fits.version}.zip"
|
41
|
-
end
|
42
|
-
|
43
|
-
adapters.text = :tika
|
44
|
-
adapters.metadata = :fits
|
15
|
+
# Yields a configuration object for the service
|
16
|
+
def configure
|
17
|
+
@config ||= Configuration.new
|
18
|
+
yield @config
|
45
19
|
end
|
46
20
|
|
47
21
|
end
|
@@ -49,4 +23,3 @@ module Ddr
|
|
49
23
|
end
|
50
24
|
end
|
51
25
|
|
52
|
-
Ddr::Extraction.set_defaults
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "delegate"
|
2
|
+
require_relative "adapters"
|
3
|
+
|
4
|
+
module Ddr
|
5
|
+
module Extraction
|
6
|
+
class Adapter < ::SimpleDelegator
|
7
|
+
|
8
|
+
class << self
|
9
|
+
# Accessors for adapter types
|
10
|
+
attr_accessor :text, :metadata
|
11
|
+
|
12
|
+
def build_adapter(type)
|
13
|
+
adapter_name = send(type)
|
14
|
+
adapter = Adapters.get_adapter(adapter_name)
|
15
|
+
new(adapter.new)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -8,18 +8,6 @@ module Ddr
|
|
8
8
|
const_get(class_name.to_sym, false)
|
9
9
|
end
|
10
10
|
|
11
|
-
def self.build_adapter(type)
|
12
|
-
adapter_name = config.send(type)
|
13
|
-
adapter = get_adapter(adapter_name)
|
14
|
-
adapter.new
|
15
|
-
end
|
16
|
-
|
17
|
-
Config = Struct.new(:text, :metadata)
|
18
|
-
|
19
|
-
def self.config
|
20
|
-
@@config ||= Config.new
|
21
|
-
end
|
22
|
-
|
23
11
|
end
|
24
12
|
end
|
25
13
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
require_relative "adapter"
|
2
|
-
|
3
1
|
module Ddr
|
4
2
|
module Extraction
|
5
3
|
module Adapters
|
6
|
-
class FitsAdapter
|
4
|
+
class FitsAdapter
|
7
5
|
|
8
6
|
# Return metadata extracted from file
|
9
7
|
#
|
10
8
|
# @param file [String] the file from which to extract metadata.
|
11
|
-
# @return [IO]
|
9
|
+
# @return [IO] the output
|
12
10
|
def extract_metadata(file)
|
13
11
|
IO.popen([self.class.path, "-i", file])
|
14
12
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
require_relative "adapter"
|
2
|
-
|
3
1
|
module Ddr
|
4
2
|
module Extraction
|
5
3
|
module Adapters
|
6
|
-
class TikaAdapter
|
4
|
+
class TikaAdapter
|
7
5
|
|
8
6
|
# Extract text from file
|
9
7
|
#
|
10
8
|
# @param file [String] path to file from which to extract text
|
11
|
-
# @return [IO]
|
9
|
+
# @return [IO] the output
|
12
10
|
def extract_text(file)
|
13
11
|
IO.popen(["java", "-jar", self.class.path, "--text", file])
|
14
12
|
end
|
@@ -26,8 +24,11 @@ module Ddr
|
|
26
24
|
# URL to download distribution
|
27
25
|
attr_accessor :download_url
|
28
26
|
|
27
|
+
# Verify checksum?
|
28
|
+
attr_accessor :verify_checksum
|
29
|
+
|
29
30
|
# Tika distribution checksum
|
30
|
-
attr_accessor :
|
31
|
+
attr_accessor :checksum_value
|
31
32
|
|
32
33
|
# Tika distribution checksum type
|
33
34
|
attr_accessor :checksum_type
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require_relative "adapter"
|
2
|
+
require_relative "adapters"
|
3
|
+
|
4
|
+
module Ddr
|
5
|
+
module Extraction
|
6
|
+
class Configuration
|
7
|
+
|
8
|
+
def adapters(name)
|
9
|
+
config = Adapters.get_adapter(name)
|
10
|
+
yield config if block_given?
|
11
|
+
config
|
12
|
+
end
|
13
|
+
|
14
|
+
def adapter
|
15
|
+
config = Adapter
|
16
|
+
yield config if block_given?
|
17
|
+
config
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "ddr-extraction"
|
2
|
+
|
3
|
+
BIN_DIR = File.expand_path("../../../../bin", __FILE__)
|
4
|
+
TIKA_VERSION = "1.6"
|
5
|
+
FITS_VERSION = "0.8.3"
|
6
|
+
|
7
|
+
Ddr::Extraction.configure do |config|
|
8
|
+
config.adapter.text = :tika
|
9
|
+
config.adapter.metadata = :fits
|
10
|
+
|
11
|
+
config.adapters(:tika) do |tika|
|
12
|
+
tika.version = TIKA_VERSION
|
13
|
+
tika.path = File.join(BIN_DIR, "tika-app.jar")
|
14
|
+
tika.download_url = "http://archive.apache.org/dist/tika/tika-app-#{TIKA_VERSION}.jar"
|
15
|
+
tika.verify_checksum = true
|
16
|
+
tika.checksum_value = "99df0d8c3f6a2be498d275053e611fb5afdf0a9d"
|
17
|
+
tika.checksum_type = :SHA1
|
18
|
+
end
|
19
|
+
|
20
|
+
config.adapters(:fits) do |fits|
|
21
|
+
fits.version = "0.8.3"
|
22
|
+
fits.path = File.join(BIN_DIR, "fits-#{FITS_VERSION}", "fits.sh")
|
23
|
+
fits.download_url = "http://projects.iq.harvard.edu/files/fits/files/fits-#{FITS_VERSION}.zip"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
@@ -3,28 +3,20 @@ require_relative "adapters"
|
|
3
3
|
module Ddr
|
4
4
|
module Extraction
|
5
5
|
class Extractor
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
def_delegator :text_adapter, :extract_text
|
9
|
-
def_delegator :metadata_adapter, :extract_metadata
|
10
6
|
|
11
7
|
# Extracts a type of content from a file
|
12
8
|
#
|
13
9
|
# @param type [Symbol] the type of content to extract, `:text` or `:metadata`.
|
14
10
|
# @param file [String] path to file from which to extract content.
|
15
|
-
# @return
|
11
|
+
# @return [IO] the output
|
16
12
|
def extract(type, file)
|
17
|
-
send("extract_#{type}", file)
|
18
|
-
end
|
13
|
+
adapter(type).send("extract_#{type}", file)
|
14
|
+
end
|
19
15
|
|
20
16
|
private
|
21
17
|
|
22
|
-
def
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def metadata_adapter
|
27
|
-
@metadata_adapter ||= Adapters.build_adapter(:metadata)
|
18
|
+
def adapter(type)
|
19
|
+
Adapter.build_adapter(type)
|
28
20
|
end
|
29
21
|
|
30
22
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require "ddr
|
1
|
+
require "ddr/extraction/defaults"
|
2
2
|
require "openssl"
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = File.absolute_path("tmp")
|
@@ -7,18 +7,20 @@ namespace :tika do
|
|
7
7
|
desc "Download Tika app"
|
8
8
|
task :download => :download_dir do
|
9
9
|
tika_app = File.join(DOWNLOAD_DIR, "tika-app.jar")
|
10
|
-
Ddr::Extraction
|
11
|
-
|
12
|
-
|
10
|
+
tika_config = Ddr::Extraction.config.adapters(:tika)
|
11
|
+
puts "Downloading Tika app ... "
|
12
|
+
system "curl -L #{tika_config.download_url} -o #{tika_app}"
|
13
|
+
if tika_config.verify_checksum
|
13
14
|
puts "Verifiying checksum ... "
|
14
|
-
digest = OpenSSL::Digest.const_get(
|
15
|
-
digest << File.read(
|
16
|
-
if digest.to_s
|
17
|
-
|
18
|
-
|
19
|
-
|
15
|
+
digest = OpenSSL::Digest.const_get(tika_config.checksum_type).new
|
16
|
+
digest << File.read(tika_config.path)
|
17
|
+
if digest.to_s != tika_config.checksum_value
|
18
|
+
puts "Checksums do not match -- aborting!"
|
19
|
+
FileUtils.remove_entry_secure(tika_app)
|
20
|
+
abort
|
20
21
|
end
|
21
22
|
end
|
23
|
+
FileUtils.mv(tika_app, tika_config.path)
|
22
24
|
end
|
23
25
|
|
24
26
|
# namespace :server do
|
@@ -32,13 +34,12 @@ namespace :fits do
|
|
32
34
|
desc "Download FITS tool"
|
33
35
|
task :download => :download_dir do
|
34
36
|
fits_tool = File.join(DOWNLOAD_DIR, "fits.zip")
|
35
|
-
Ddr::Extraction
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
37
|
+
fits_config = Ddr::Extraction.config.adapters(:fits)
|
38
|
+
puts "Downloading FITS tool ... "
|
39
|
+
system "curl -L #{fits_config.download_url} -o #{fits_tool}"
|
40
|
+
# Unzip options: convert text files, force overwrite, extra quiet
|
41
|
+
system "unzip -a -o -qq -d bin #{fits_tool}"
|
42
|
+
FileUtils.chmod(0755, fits_config.path)
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ddr-extraction
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Chandek-Stark
|
@@ -70,14 +70,13 @@ files:
|
|
70
70
|
- ddr-extraction.gemspec
|
71
71
|
- lib/ddr-extraction.rb
|
72
72
|
- lib/ddr/extraction.rb
|
73
|
+
- lib/ddr/extraction/adapter.rb
|
73
74
|
- lib/ddr/extraction/adapters.rb
|
74
|
-
- lib/ddr/extraction/adapters/adapter.rb
|
75
75
|
- lib/ddr/extraction/adapters/fits_adapter.rb
|
76
|
-
- lib/ddr/extraction/adapters/metadata_extraction_adapter.rb
|
77
|
-
- lib/ddr/extraction/adapters/text_extraction_adapter.rb
|
78
76
|
- lib/ddr/extraction/adapters/tika_adapter.rb
|
77
|
+
- lib/ddr/extraction/configuration.rb
|
78
|
+
- lib/ddr/extraction/defaults.rb
|
79
79
|
- lib/ddr/extraction/extractor.rb
|
80
|
-
- lib/ddr/extraction/metadata_extractor.rb
|
81
80
|
- lib/ddr/extraction/version.rb
|
82
81
|
- lib/tasks/ddr_extraction.rake
|
83
82
|
- spec/fixtures/blue-devil.png
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require "ddr/extraction/adapters"
|
2
|
-
|
3
|
-
module Ddr
|
4
|
-
module Extraction
|
5
|
-
class MetadataExtractor
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
def_delegator :@adapter, :extract_metadata
|
9
|
-
|
10
|
-
def initialize
|
11
|
-
@adapter = Ddr::Extraction::Adapters.get_metadata_extraction_adapter
|
12
|
-
end
|
13
|
-
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|