hydra-derivatives 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/History.md +43 -0
- data/README.md +23 -38
- data/VERSION +1 -1
- data/hydra-derivatives.gemspec +0 -1
- data/lib/hydra/derivatives.rb +23 -123
- data/lib/hydra/derivatives/io_decorator.rb +7 -1
- data/lib/hydra/derivatives/processors.rb +19 -0
- data/lib/hydra/derivatives/processors/audio.rb +6 -0
- data/lib/hydra/derivatives/processors/document.rb +28 -0
- data/lib/hydra/derivatives/processors/ffmpeg.rb +22 -0
- data/lib/hydra/derivatives/processors/full_text.rb +60 -0
- data/lib/hydra/derivatives/processors/image.rb +58 -0
- data/lib/hydra/derivatives/processors/jpeg2k_image.rb +129 -0
- data/lib/hydra/derivatives/processors/processor.rb +38 -0
- data/lib/hydra/derivatives/processors/raw_image.rb +37 -0
- data/lib/hydra/derivatives/processors/shell_based_processor.rb +108 -0
- data/lib/hydra/derivatives/{video.rb → processors/video.rb} +1 -1
- data/lib/hydra/derivatives/{video → processors/video}/config.rb +1 -1
- data/lib/hydra/derivatives/{video → processors/video}/processor.rb +2 -8
- data/lib/hydra/derivatives/runners/audio_derivatives.rb +7 -0
- data/lib/hydra/derivatives/runners/document_derivatives.rb +7 -0
- data/lib/hydra/derivatives/runners/full_text_extract.rb +16 -0
- data/lib/hydra/derivatives/runners/image_derivatives.rb +16 -0
- data/lib/hydra/derivatives/runners/jpeg2k_image_derivatives.rb +15 -0
- data/lib/hydra/derivatives/runners/pdf_derivatives.rb +6 -0
- data/lib/hydra/derivatives/runners/runner.rb +52 -0
- data/lib/hydra/derivatives/runners/video_derivatives.rb +7 -0
- data/lib/hydra/derivatives/services/mime_type_service.rb +10 -0
- data/lib/hydra/derivatives/services/persist_basic_contained_output_file_service.rb +23 -8
- data/lib/hydra/derivatives/services/persist_output_file_service.rb +4 -5
- data/lib/hydra/derivatives/services/retrieve_source_file_service.rb +8 -6
- data/spec/processors/full_text.rb +61 -0
- data/spec/{units → processors}/image_spec.rb +7 -17
- data/spec/{units → processors}/jpeg2k_spec.rb +9 -11
- data/spec/processors/processor_spec.rb +36 -0
- data/spec/processors/shell_based_processor_spec.rb +19 -0
- data/spec/processors/video_spec.rb +40 -0
- data/spec/services/audio_derivatives_spec.rb +76 -0
- data/spec/services/persist_basic_contained_output_file_service_spec.rb +4 -3
- data/spec/services/retrieve_source_file_service_spec.rb +16 -12
- data/spec/units/derivatives_spec.rb +18 -26
- data/spec/units/io_decorator_spec.rb +33 -0
- data/spec/units/transcoding_spec.rb +109 -86
- metadata +42 -44
- data/lib/hydra/derivatives/audio.rb +0 -19
- data/lib/hydra/derivatives/document.rb +0 -56
- data/lib/hydra/derivatives/extract_metadata.rb +0 -27
- data/lib/hydra/derivatives/ffmpeg.rb +0 -31
- data/lib/hydra/derivatives/image.rb +0 -73
- data/lib/hydra/derivatives/jpeg2k_image.rb +0 -136
- data/lib/hydra/derivatives/processor.rb +0 -33
- data/lib/hydra/derivatives/railtie.rb +0 -9
- data/lib/hydra/derivatives/raw_image.rb +0 -45
- data/lib/hydra/derivatives/shell_based_processor.rb +0 -81
- data/spec/lib/hydra/derivatives/extract_metadata_spec.rb +0 -39
- data/spec/units/extract_spec.rb +0 -22
- data/spec/units/processor_spec.rb +0 -61
- data/spec/units/shell_based_processor_spec.rb +0 -22
- data/spec/units/video_spec.rb +0 -50
@@ -0,0 +1,60 @@
|
|
1
|
+
module Hydra::Derivatives::Processors
  # Extract the full text from the content using Solr's extract handler
  # and hand the resulting text to the output file service.
  class FullText < Processor
    # Run the full text extraction and save the result.
    # @return the value returned by the configured output file service
    def process
      output_file_service.call(extract, directives)
    end

    private

    ##
    # Extract full text from the content using Solr's extract handler.
    # The handler's JSON response is parsed and the text is read from the
    # entry keyed by the empty string.
    #
    # @return [String] The extracted text, with trailing whitespace removed
    def extract
      JSON.parse(fetch)[''].rstrip
    end

    # Send the request to the extract service and return the response if it was successful.
    # TODO: this pulls the whole file into memory. We should stream it from Fedora instead
    # @return [String] the result of calling the extract service
    # @raise [RuntimeError] when the service answers with anything other than HTTP 200
    def fetch
      req = Net::HTTP.new(uri.host, uri.port)
      resp = req.post(uri.to_s, file_content, request_headers)
      raise "Solr Extract service was unsuccessful. '#{uri}' returned code #{resp.code} for #{source_path}\n#{resp.body}" unless resp.code == '200'

      resp.body
    end

    # The raw bytes of the source file, read once and memoized.
    # The block form of File.open closes the handle as soon as the read finishes.
    # @return [String] the content of the file at source_path
    def file_content
      @content ||= File.open(source_path, 'rb', &:read)
    end

    # @return [Hash] the request headers to send to the Solr extract service
    def request_headers
      { Faraday::Request::UrlEncoded::CONTENT_TYPE => mime_type.to_s,
        Faraday::Adapter::CONTENT_LENGTH => original_size.to_s }
    end

    # @return the mime type reported by MimeTypeService for the source file
    def mime_type
      Hydra::Derivatives::MimeTypeService.mime_type(source_path)
    end

    # @return [Integer] size of the source file in bytes
    def original_size
      File.size(source_path)
    end

    # @return [URI] path to the extract service
    def uri
      @uri ||= URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
    end

    # @return the base Solr URL taken from the ActiveFedora Solr configuration
    def connection_url
      ActiveFedora.solr_config[:url]
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'mini_magick'
|
2
|
+
|
3
|
+
module Hydra::Derivatives::Processors
  # Creates an image derivative of the file at source_path with MiniMagick
  # and passes the result to the output file service.
  class Image < Processor
    # Number of seconds processing may take; when unset no limit is applied.
    class_attribute :timeout

    # Create the derivative, enforcing the timeout when one is configured.
    def process
      timeout ? process_with_timeout : process_without_timeout
    end

    # Run the processing under Timeout.
    # @raise [Hydra::Derivatives::TimeoutError] when processing exceeds the timeout
    def process_with_timeout
      Timeout.timeout(timeout) { process_without_timeout }
    rescue Timeout::Error
      raise Hydra::Derivatives::TimeoutError, "Unable to process image derivative\nThe command took longer than #{timeout} seconds to execute"
    end

    # Read the directives and create the resized image.
    # :format is required (KeyError when missing); :label defaults to the
    # format, :size and :quality default to nil.
    def process_without_timeout
      format = directives.fetch(:format)
      name = directives.fetch(:label, format)
      destination_name = output_filename_for(name)
      size = directives.fetch(:size, nil)
      quality = directives.fetch(:quality, nil)
      create_resized_image(destination_name, size, format, quality)
    end

    protected

    # Create the image, resizing it first when a size was requested.
    def create_resized_image(destination_name, size, format, quality=nil)
      create_image(destination_name, format, quality) do |xfrm|
        xfrm.resize(size) if size.present?
      end
    end

    # Load the transformer, let the caller's block adjust it, apply the
    # format and optional quality, then write the result.
    # @yield [xfrm] the image transformer, before format and quality are applied
    def create_image(destination_name, format, quality=nil)
      xfrm = load_image_transformer
      yield(xfrm) if block_given?
      xfrm.format(format)
      xfrm.quality(quality.to_s) if quality
      write_image(destination_name, format, xfrm)
    end

    # Write the transformed image into an in-memory buffer and hand it to the
    # output file service together with the directives.
    # destination_name and format are not used by this implementation.
    def write_image(destination_name, format, xfrm)
      output_io = StringIO.new
      xfrm.write(output_io)
      output_io.rewind # the service should read from the start of the buffer

      output_file_service.call(output_io, directives)
    end

    # Override this method if you want a different transformer, or need to load the
    # raw image from a different source (e.g. external file)
    def load_image_transformer
      MiniMagick::Image.open(source_path)
    end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
require 'mini_magick'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
|
5
|
+
module Hydra::Derivatives::Processors
|
6
|
+
class Jpeg2kImage < Processor
|
7
|
+
include ShellBasedProcessor
|
8
|
+
|
9
|
+
# Convert the source image into a JPEG2000 derivative.
#
# The image is opened with MiniMagick, optionally resized and/or given the
# sRGB profile, written to a temporary TIFF and then encoded with the recipe
# selected by kdu_compress_recipe. The temporary TIFF is removed even when
# preprocessing or encoding raises.
#
# @raise [KeyError] when the directives contain no :label
def process
  image = MiniMagick::Image.open(source_path)
  quality = image['%[channels]'] == 'gray' ? 'gray' : 'color'
  directives.fetch(:label) # value is unused; the fetch is kept so a missing :label still raises
  long_dim = self.class.long_dim(image)
  file_path = self.class.tmp_file('.tif')
  to_srgb = directives.fetch(:to_srgb, true)
  if directives[:resize] || to_srgb
    preprocess(image, resize: directives[:resize], to_srgb: to_srgb, src_quality: quality)
  end
  image.write file_path
  recipe = self.class.kdu_compress_recipe(directives, quality, long_dim)
  encode_file(recipe, file_path: file_path)
ensure
  # file_path is nil when the failure happened before the name was generated
  File.unlink(file_path) if file_path && File.exist?(file_path)
end
|
24
|
+
|
25
|
+
# Encode a file to JPEG2000 and pass the result to the output file service.
#
# @param recipe [String] the kdu_compress arguments to use
# @param opts [Hash]
# @option opts [String] :file_path path of the file to encode; when absent
#   the processor's own source_path is encoded
#
# The temporary .jp2 file is opened only for the duration of the service call
# and is removed afterwards, including when encoding or persisting raises.
def encode_file(recipe, opts={})
  output_file = self.class.tmp_file('.jp2')
  # The original fallback referenced `source_file`, which this processor does
  # not define; the source is already on disk at source_path.
  input_path = opts[:file_path] || source_path
  self.class.encode(input_path, recipe, output_file)
  File.open(output_file, 'rb') { |io| output_file_service.call(io, directives) }
ensure
  File.unlink(output_file) if output_file && File.exist?(output_file)
end
|
37
|
+
|
38
|
+
protected
|
39
|
+
# Apply the optional resize and sRGB profile to the image in one
# combine_options call.
#
# @param image the image to modify; it must respond to combine_options
# @param opts [Hash] resize: <geometry>, to_srgb: <bool>, src_quality: 'color'|'gray'
# @return the image that was passed in
def preprocess(image, opts={})
  geometry = opts[:resize]
  # the profile is only applied to colour sources that asked for sRGB
  apply_profile = opts[:src_quality] == 'color' && opts[:to_srgb]
  image.combine_options do |command|
    command.resize(geometry) if geometry
    command.profile(self.class.srgb_profile_path) if apply_profile
  end
  image
end
|
47
|
+
|
48
|
+
# Run kdu_compress on a file.
#
# @param path path of the input file
# @param recipe the kdu_compress arguments appended after the input and output
# @param output_file path the encoded file is written to
def self.encode(path, recipe, output_file)
  binary = Hydra::Derivatives.kdu_compress_path
  command = format('%s -i %s -o %s %s', binary, path, output_file, recipe)
  execute command
end
|
52
|
+
|
53
|
+
# Location of the sRGB colour profile, resolved relative to this file.
# @return [String] absolute path ending in color_profiles/sRGB_IEC61966-2-1_no_black_scaling.icc
def self.srgb_profile_path
  base_dir = File.expand_path('../../../', __FILE__)
  File.join(base_dir, 'color_profiles', 'sRGB_IEC61966-2-1_no_black_scaling.icc')
end
|
60
|
+
|
61
|
+
# Generate a temporary file name under the configured temp file base.
# The block given to Dir::Tmpname.create is empty, so this method itself
# creates no file.
#
# @param ext [String] the extension, including the dot
# @return [String] the generated path
def self.tmp_file(ext)
  name_parts = ['sufia', ext]
  base_dir = Hydra::Derivatives.temp_file_base
  Dir::Tmpname.create(name_parts, base_dir) { |_candidate| }
end
|
64
|
+
|
65
|
+
# The larger of the image's width and height.
# @param image an object answering [:width] and [:height]
def self.long_dim(image)
  sides = %i[width height].map { |side| image[side] }
  sides.max
end
|
68
|
+
|
69
|
+
# Choose the kdu_compress arguments for a derivative.
#
# @param args [Hash] the directives; args[:recipe] selects the behaviour:
#   * Symbol - looked up in Hydra::Derivatives.kdu_compress_recipes under
#     "<recipe>_<quality>"; when no such recipe is configured a warning is
#     logged and a recipe is calculated instead
#   * String - used verbatim
#   * anything else - a recipe is calculated
# @param quality [String] 'gray' or 'color'
# @param long_dim the longest dimension of the image
# @return the recipe to pass to kdu_compress
def self.kdu_compress_recipe(args, quality, long_dim)
  requested = args[:recipe]
  case requested
  when Symbol
    recipe = [requested.to_s, quality].join('_').to_sym
    configured = Hydra::Derivatives.kdu_compress_recipes
    return configured[recipe] if configured.key?(recipe)

    ActiveFedora::Base.logger.warn "No JP2 recipe for :#{requested} ('#{recipe}') found in configuration. Using best guess."
    calculate_recipe(args, quality, long_dim)
  when String
    requested
  else
    calculate_recipe(args, quality, long_dim)
  end
end
|
84
|
+
|
85
|
+
# Build a kdu_compress argument string from the directives.
#
# @param args [Hash] may contain :levels, :layers (default 8),
#   :compression (default 10) and :tile_size (default 1024)
# @param quality [String] 'gray' selects the sLUM colour space, anything else sRGB
# @param long_dim the longest dimension of the image, used for the default :levels
# @return [String] the arguments separated by single spaces
def self.calculate_recipe(args, quality, long_dim)
  levels_arg = args.fetch(:levels, level_count_for_size(long_dim))
  rates_arg = layer_rates(args.fetch(:layers, 8), args.fetch(:compression, 10))
  tile_size = args.fetch(:tile_size, 1024)
  color_space = 'sRGB'
  color_space = 'sLUM' if quality == 'gray'

  switches = [
    "-rate #{rates_arg}",
    "-jp2_space #{color_space}",
    '-double_buffering 10',
    '-num_threads 4',
    '-no_weights',
    "Clevels=#{levels_arg}",
    %Q("Stiles={#{tile_size},#{tile_size}}"),
    '"Cblk={64,64}"',
    'Cuse_sop=yes',
    'Cuse_eph=yes',
    'Corder=RPCL',
    'ORGgen_plt=yes',
    'ORGtparts=R'
  ]
  # collapse any whitespace inside interpolated values as well
  switches.join(' ').gsub(/\s+/, " ").strip
end
|
106
|
+
|
107
|
+
# Number of levels to request for an image of the given size.
#
# The dimension is halved until it drops below 96; the result is one less
# than the number of halvings. Images already smaller than 96 get 0 levels
# (the unclamped calculation produced -1, i.e. "Clevels=-1").
#
# @param long_dim [Numeric] the longest dimension of the image
# @return [Integer] the level count, never negative
def self.level_count_for_size(long_dim)
  levels = 0
  level_size = long_dim
  while level_size >= 96
    level_size /= 2
    levels += 1
  end
  [levels - 1, 0].max
end
|
116
|
+
|
117
|
+
# Build the comma-separated list of layer rates.
#
# The first rate is 24.0 / compression_numerator
# (e.g. if compression_numerator = 10 then compression is 10:1);
# each following rate is the previous one divided by 1.618 and rounded
# to 8 decimal places.
#
# @param layer_count [Integer] number of rates to produce
# @param compression_numerator [Numeric] the N of an N:1 compression
# @return [String] the rates joined by commas
def self.layer_rates(layer_count, compression_numerator)
  rate = 24.0 / compression_numerator
  layer_count.times.map do
    current = rate
    rate = (rate / 1.618).round(8)
    current
  end.join(',')
end
|
127
|
+
|
128
|
+
end
|
129
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Hydra::Derivatives::Processors
  # Base class for processors: each one takes a single input and produces a
  # single output. Subclasses implement #process.
  class Processor
    attr_accessor :source_path, :directives, :output_file_service

    # @param [String] source_path path to the file on disk
    # @param [Hash] directives directions for creating the output
    # @option [String] :format the format of the output
    # @option [String] :url the location to put the output
    # @param [Hash] opts
    # @option [#call] :output_file_service An output file service to call;
    #   falls back to Hydra::Derivatives.output_file_service
    def initialize(source_path, directives, opts={})
      default_service = Hydra::Derivatives.output_file_service
      self.source_path = source_path
      self.directives = directives
      self.output_file_service = opts.fetch(:output_file_service, default_service)
    end

    # Must be implemented by subclasses; this version always raises.
    def process
      message = "Processor is an abstract class. Implement `process' on #{self.class.name}"
      raise message
    end

    # This governs the output key sent to the persist file service
    # while this is adequate for storing in Fedora, it's not a great name for saving
    # to the file system.
    # NOTE(review): out_prefix is not defined in this class - confirm it is
    # supplied elsewhere before relying on this method.
    def output_file_id(name)
      parts = [out_prefix, name]
      parts.join('_')
    end

    # The base name of the source file; the argument is ignored.
    def output_filename_for(_name)
      File.basename(source_path)
    end

    # @deprecated Please use a PersistOutputFileService class to save an object
    def output_file
      message = "Processor is an abstract class. Utilize an implementation of a PersistOutputFileService class in #{self.class.name}"
      raise NotImplementedError, message
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'mini_magick'
|
2
|
+
|
3
|
+
module Hydra::Derivatives::Processors
  # Image processor for RAW source files. It differs from Image in the order
  # of the format and resize steps and in removing MiniMagick's temporary
  # files once the derivative has been written.
  class RawImage < Image
    class_attribute :timeout

    protected

    # Create the derivative and clean up the transformer's temporary files.
    # The cleanup runs even when transforming or writing raises, and the
    # method returns the result of write_image, as Image#create_image does.
    #
    # @yield [xfrm] the image transformer, after the format has been applied
    def create_image(destination_name, format, quality=nil)
      xfrm = load_image_transformer
      # Transpose format and scaling due to the fact that ImageMagick can
      # read but not write RAW files and this will otherwise cause many
      # cryptic segmentation faults
      xfrm.format(format)
      yield(xfrm) if block_given?
      xfrm.quality(quality.to_s) if quality
      write_image(destination_name, format, xfrm)
    ensure
      # xfrm is nil when loading the transformer itself failed
      remove_temp_files(xfrm) if xfrm
    end

    # Delete any temp files that might clutter up the disk if
    # you are doing a batch or don't touch your temporary storage
    # for a long time
    def remove_temp_files(xfrm)
      xfrm.destroy!
    end

    # Override this method if you want a different transformer, or
    # need to load the raw image from a different source (e.g.
    # external file).
    #
    # NOTE(review): an earlier comment here said an extension is always added
    # to help MiniMagick with RAW files; this implementation opens
    # source_path unchanged - confirm the path already carries an extension.
    def load_image_transformer
      MiniMagick::Image.open(source_path)
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# An abstract class for asynchronous jobs that transcode files using FFMpeg
|
2
|
+
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'open3'
|
5
|
+
|
6
|
+
module Hydra::Derivatives::Processors
  # Shared behaviour for processors that create a derivative by running an
  # external command. Including classes are expected to provide a class-level
  # encode(path, options, output_file) and may override #options_for.
  module ShellBasedProcessor
    extend ActiveSupport::Concern

    # Number of bytes read from the command's stderr per non-blocking read.
    BLOCK_SIZE = 1024

    included do
      # Number of seconds a command may run; when unset no limit is applied.
      class_attribute :timeout
      extend Open3
    end

    # Transcode the source into the format named by the directives.
    # @raise [KeyError] when the directives contain no :label
    # @raise [ArgumentError] when the directives contain no :format
    def process
      directives.fetch(:label) # value is unused; the fetch is kept so a missing :label still raises
      format = directives[:format]
      raise ArgumentError, "You must provide the :format you want to transcode into. You provided #{directives}" unless format
      # TODO if the source is in the correct format, we could just copy it and skip transcoding.
      encode_file(format, options_for(format))
    end

    # override this method in subclass if you want to provide specific options.
    # returns a hash of options that the specific processors use
    def options_for(format)
      {}
    end

    # Encode the source into a temporary file and pass it to the output file
    # service. The file is opened only for the duration of the service call
    # and is removed afterwards, including when encoding or persisting raises.
    def encode_file(file_suffix, options)
      temp_file_name = output_file(file_suffix)
      self.class.encode(source_path, options, temp_file_name)
      File.open(temp_file_name, 'rb') { |io| output_file_service.call(io, directives) }
    ensure
      File.unlink(temp_file_name) if temp_file_name && File.exist?(temp_file_name)
    end

    # Generate a temporary file name with the given suffix as its extension.
    # The block given to Dir::Tmpname.create is empty, so no file is created here.
    def output_file(file_suffix)
      Dir::Tmpname.create(['sufia', ".#{file_suffix}"], Hydra::Derivatives.temp_file_base){}
    end

    module ClassMethods
      # Run a shell command, enforcing the class timeout when one is set.
      # @param command [String] the command line to run
      def execute(command)
        context = {}
        if timeout
          execute_with_timeout(timeout, command, context)
        else
          execute_without_timeout(command, context)
        end
      end

      # Run the command and kill it if it takes longer than timeout seconds.
      # @param context [Hash] receives the child's pid under :pid
      # @raise [Hydra::Derivatives::TimeoutError] when the command times out
      def execute_with_timeout(timeout, command, context)
        Timeout.timeout(timeout) { execute_without_timeout(command, context) }
      rescue Timeout::Error
        pid = context[:pid]
        # pid is nil when the timeout fired before the command was spawned
        Process.kill("KILL", pid) if pid
        raise Hydra::Derivatives::TimeoutError, "Unable to execute command \"#{command}\"\nThe command took longer than #{timeout} seconds to execute"
      end

      # Run the command, collecting its stderr, and wait for it to exit.
      # stdin and stdout of the child are closed immediately; only stderr is read.
      # @param context [Hash] receives the child's pid under :pid
      # @raise [RuntimeError] when the command exits unsuccessfully; the
      #   message contains the exit status and the collected stderr
      def execute_without_timeout(command, context)
        err_str = String.new
        stdin, stdout, stderr, wait_thr = popen3(command)
        context[:pid] = wait_thr[:pid]
        stdin.close
        stdout.close
        files = [stderr]

        until all_eof?(files)
          ready = IO.select(files, nil, nil, 60)
          next unless ready

          ready[0].each do |f|
            begin
              err_str << f.read_nonblock(BLOCK_SIZE)
            rescue EOFError
              # No big deal.
              Rails.logger.debug "Caught an eof error in ShellBasedProcessor"
            end
          end
        end
        exit_status = wait_thr.value

        raise "Unable to execute command \"#{command}\". Exit code: #{exit_status}\nError message: #{err_str}" unless exit_status.success?
      ensure
        # release the pipe on every path, including a timeout raised mid-read
        stderr.close if stderr && !stderr.closed?
      end

      # @return [Boolean] true when every given IO is at end of file
      def all_eof?(files)
        files.all?(&:eof)
      end
    end
  end
end
|