hydra-derivatives 2.0.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/History.md +43 -0
- data/README.md +23 -38
- data/VERSION +1 -1
- data/hydra-derivatives.gemspec +0 -1
- data/lib/hydra/derivatives.rb +23 -123
- data/lib/hydra/derivatives/io_decorator.rb +7 -1
- data/lib/hydra/derivatives/processors.rb +19 -0
- data/lib/hydra/derivatives/processors/audio.rb +6 -0
- data/lib/hydra/derivatives/processors/document.rb +28 -0
- data/lib/hydra/derivatives/processors/ffmpeg.rb +22 -0
- data/lib/hydra/derivatives/processors/full_text.rb +60 -0
- data/lib/hydra/derivatives/processors/image.rb +58 -0
- data/lib/hydra/derivatives/processors/jpeg2k_image.rb +129 -0
- data/lib/hydra/derivatives/processors/processor.rb +38 -0
- data/lib/hydra/derivatives/processors/raw_image.rb +37 -0
- data/lib/hydra/derivatives/processors/shell_based_processor.rb +108 -0
- data/lib/hydra/derivatives/{video.rb → processors/video.rb} +1 -1
- data/lib/hydra/derivatives/{video → processors/video}/config.rb +1 -1
- data/lib/hydra/derivatives/{video → processors/video}/processor.rb +2 -8
- data/lib/hydra/derivatives/runners/audio_derivatives.rb +7 -0
- data/lib/hydra/derivatives/runners/document_derivatives.rb +7 -0
- data/lib/hydra/derivatives/runners/full_text_extract.rb +16 -0
- data/lib/hydra/derivatives/runners/image_derivatives.rb +16 -0
- data/lib/hydra/derivatives/runners/jpeg2k_image_derivatives.rb +15 -0
- data/lib/hydra/derivatives/runners/pdf_derivatives.rb +6 -0
- data/lib/hydra/derivatives/runners/runner.rb +52 -0
- data/lib/hydra/derivatives/runners/video_derivatives.rb +7 -0
- data/lib/hydra/derivatives/services/mime_type_service.rb +10 -0
- data/lib/hydra/derivatives/services/persist_basic_contained_output_file_service.rb +23 -8
- data/lib/hydra/derivatives/services/persist_output_file_service.rb +4 -5
- data/lib/hydra/derivatives/services/retrieve_source_file_service.rb +8 -6
- data/spec/processors/full_text.rb +61 -0
- data/spec/{units → processors}/image_spec.rb +7 -17
- data/spec/{units → processors}/jpeg2k_spec.rb +9 -11
- data/spec/processors/processor_spec.rb +36 -0
- data/spec/processors/shell_based_processor_spec.rb +19 -0
- data/spec/processors/video_spec.rb +40 -0
- data/spec/services/audio_derivatives_spec.rb +76 -0
- data/spec/services/persist_basic_contained_output_file_service_spec.rb +4 -3
- data/spec/services/retrieve_source_file_service_spec.rb +16 -12
- data/spec/units/derivatives_spec.rb +18 -26
- data/spec/units/io_decorator_spec.rb +33 -0
- data/spec/units/transcoding_spec.rb +109 -86
- metadata +42 -44
- data/lib/hydra/derivatives/audio.rb +0 -19
- data/lib/hydra/derivatives/document.rb +0 -56
- data/lib/hydra/derivatives/extract_metadata.rb +0 -27
- data/lib/hydra/derivatives/ffmpeg.rb +0 -31
- data/lib/hydra/derivatives/image.rb +0 -73
- data/lib/hydra/derivatives/jpeg2k_image.rb +0 -136
- data/lib/hydra/derivatives/processor.rb +0 -33
- data/lib/hydra/derivatives/railtie.rb +0 -9
- data/lib/hydra/derivatives/raw_image.rb +0 -45
- data/lib/hydra/derivatives/shell_based_processor.rb +0 -81
- data/spec/lib/hydra/derivatives/extract_metadata_spec.rb +0 -39
- data/spec/units/extract_spec.rb +0 -22
- data/spec/units/processor_spec.rb +0 -61
- data/spec/units/shell_based_processor_spec.rb +0 -22
- data/spec/units/video_spec.rb +0 -50
@@ -0,0 +1,60 @@
|
|
1
|
+
module Hydra::Derivatives::Processors
  # Extract the full text from the content using Solr's extract handler
  class FullText < Processor
    # Run the full text extraction and hand the extracted text to the
    # configured output file service.
    #
    # @return whatever `output_file_service.call` returns
    #   (NOTE(review): earlier docs described this as a success boolean --
    #   confirm against the output file service in use)
    def process
      output_file_service.call(extract, directives)
    end

    private

    ##
    # Extract full text from the content using Solr's extract handler.
    # The JSON response is parsed and the text is read from the entry whose
    # key is the empty string.
    #
    # @return [String] The extracted text, with trailing whitespace removed
    def extract
      JSON.parse(fetch)[''].rstrip
    end

    # Send the request to the extract service and return the response if it was successful.
    # TODO: this pulls the whole file into memory. We should stream it from Fedora instead
    #
    # @return [String] the body returned by the extract service
    # @raise [RuntimeError] when the service answers with anything other than a 200
    def fetch
      req = Net::HTTP.new(uri.host, uri.port)
      # Without this an https Solr URL would be contacted in plain HTTP.
      req.use_ssl = true if uri.scheme == 'https'
      resp = req.post(uri.to_s, file_content, request_headers)
      raise "Solr Extract service was unsuccessful. '#{uri}' returned code #{resp.code} for #{source_path}\n#{resp.body}" unless resp.code == '200'
      # Only relevant when file_content has been overridden to return an IO.
      file_content.rewind if file_content.respond_to?(:rewind)

      resp.body
    end

    # The content of the source file, read once and memoized.
    # File.read closes the underlying handle as soon as the content is loaded.
    #
    # @return [String] the content of the file at source_path
    def file_content
      @content ||= File.read(source_path)
    end

    # @return [Hash] the request headers to send to the Solr extract service
    def request_headers
      { Faraday::Request::UrlEncoded::CONTENT_TYPE => mime_type.to_s,
        Faraday::Adapter::CONTENT_LENGTH => original_size.to_s }
    end

    # @return the mime type reported by MimeTypeService for the source file
    def mime_type
      Hydra::Derivatives::MimeTypeService.mime_type(source_path)
    end

    # @return [Integer] size of the source file in bytes
    def original_size
      File.size(source_path)
    end

    # @return [URI] path to the extract service
    def uri
      @uri ||= URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
    end

    # @return the Solr base url taken from the ActiveFedora solr configuration
    def connection_url
      ActiveFedora.solr_config[:url]
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'mini_magick'
|
2
|
+
|
3
|
+
module Hydra::Derivatives::Processors
  # Creates an image derivative (optionally resized and re-encoded) from the
  # file at source_path using MiniMagick.
  class Image < Processor
    # Maximum number of seconds processing may take; no limit when unset.
    class_attribute :timeout

    # Create the derivative, enforcing the class-level timeout when one is set.
    def process
      timeout ? process_with_timeout : process_without_timeout
    end

    # Run process_without_timeout under Timeout.
    #
    # @raise [Hydra::Derivatives::TimeoutError] when processing takes longer
    #   than `timeout` seconds
    def process_with_timeout
      Timeout.timeout(timeout) { process_without_timeout }
    rescue Timeout::Error
      raise Hydra::Derivatives::TimeoutError, "Unable to process image derivative\nThe command took longer than #{timeout} seconds to execute"
    end

    # Read the directives and create the derivative.
    #
    # Directives read here:
    #   :format  (required) output format
    #   :label   name used to derive the destination name, defaults to the format
    #   :size    optional geometry handed to the transformer's resize
    #   :quality optional output quality
    #
    # @raise [KeyError] when the :format directive is missing
    def process_without_timeout
      format = directives.fetch(:format)
      name = directives.fetch(:label, format)
      destination_name = output_filename_for(name)
      size = directives.fetch(:size, nil)
      quality = directives.fetch(:quality, nil)
      create_resized_image(destination_name, size, format, quality)
    end

    protected

      # Create the image, resizing it first when a size was given.
      def create_resized_image(destination_name, size, format, quality = nil)
        create_image(destination_name, format, quality) do |xfrm|
          xfrm.resize(size) if size.present?
        end
      end

      # Load the source image, yield it for caller-supplied transformations,
      # then apply format and quality and persist the result.
      def create_image(destination_name, format, quality = nil)
        xfrm = load_image_transformer
        yield(xfrm) if block_given?
        xfrm.format(format)
        xfrm.quality(quality.to_s) if quality
        write_image(destination_name, format, xfrm)
      end

      # Write the transformed image into an in-memory buffer and pass that
      # buffer to the output file service together with the directives.
      # destination_name and format are not used here; they are kept so the
      # signature stays compatible with subclasses overriding this method.
      def write_image(destination_name, format, xfrm)
        output_io = StringIO.new
        xfrm.write(output_io)
        output_io.rewind

        output_file_service.call(output_io, directives)
      end

      # Override this method if you want a different transformer, or need to load the
      # raw image from a different source (e.g. external file)
      def load_image_transformer
        MiniMagick::Image.open(source_path)
      end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
require 'mini_magick'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
|
5
|
+
module Hydra::Derivatives::Processors
  # Creates a JPEG2000 derivative: the source image is optionally preprocessed
  # with MiniMagick, written to a temporary TIFF, and encoded with kdu_compress.
  class Jpeg2kImage < Processor
    include ShellBasedProcessor

    # Create the JP2 derivative from the file at source_path.
    #
    # Directives read here: :label (required), :to_srgb (default true),
    # :resize, plus the recipe-related directives consumed by
    # kdu_compress_recipe.
    #
    # @raise [KeyError] when the :label directive is missing
    def process
      image = MiniMagick::Image.open(source_path)
      quality = image['%[channels]'] == 'gray' ? 'gray' : 'color'
      directives.fetch(:label) # value unused; kept so a missing :label still raises
      long_dim = self.class.long_dim(image)
      file_path = self.class.tmp_file('.tif')
      to_srgb = directives.fetch(:to_srgb, true)
      if directives[:resize] || to_srgb
        preprocess(image, resize: directives[:resize], to_srgb: to_srgb, src_quality: quality)
      end
      image.write file_path
      recipe = self.class.kdu_compress_recipe(directives, quality, long_dim)
      encode_file(recipe, file_path: file_path)
    ensure
      # Remove the intermediate TIFF even when encoding or persisting failed.
      File.unlink(file_path) if file_path && File.exist?(file_path)
    end

    # Encode a file to JP2 and pass the result to the output file service.
    #
    # @param [String] recipe the kdu_compress arguments
    # @param [Hash] opts
    # @option opts [String] :file_path path of the file to encode
    def encode_file(recipe, opts = {})
      output_file = self.class.tmp_file('.jp2')
      if opts[:file_path]
        self.class.encode(opts[:file_path], recipe, output_file)
      else
        # NOTE(review): `source_file` is not defined by Processor or
        # ShellBasedProcessor as visible here -- confirm where it comes from.
        Hydra::Derivatives::TempfileService.create(source_file) do |f|
          self.class.encode(f.path, recipe, output_file)
        end
      end
      # The block form closes the handle once the service has consumed it.
      File.open(output_file, 'rb') { |io| output_file_service.call(io, directives) }
    ensure
      # tmp_file only reserves a name, so the file may not exist if encoding failed.
      File.unlink(output_file) if output_file && File.exist?(output_file)
    end

    protected

      # Apply the optional resize and sRGB profile conversion to the image.
      #
      # opts: resize: <geometry>, to_srgb: <bool>, src_quality: 'color'|'gray'
      # The profile is only applied to colour sources.
      #
      # @return the image that was passed in
      def preprocess(image, opts = {})
        image.combine_options do |c|
          c.resize(opts[:resize]) if opts[:resize]
          c.profile self.class.srgb_profile_path if opts[:src_quality] == 'color' && opts[:to_srgb]
        end
        image
      end

      # Run kdu_compress on path, writing to output_file with the given recipe.
      def self.encode(path, recipe, output_file)
        kdu_compress = Hydra::Derivatives.kdu_compress_path
        execute "#{kdu_compress} -i #{path} -o #{output_file} #{recipe}"
      end

      # @return [String] path of the bundled sRGB colour profile
      def self.srgb_profile_path
        File.join [
          File.expand_path('../../../', __FILE__),
          'color_profiles',
          'sRGB_IEC61966-2-1_no_black_scaling.icc'
        ]
      end

      # Generate a temporary file name with the given extension under
      # Hydra::Derivatives.temp_file_base. No file is created.
      def self.tmp_file(ext)
        Dir::Tmpname.create(['sufia', ext], Hydra::Derivatives.temp_file_base) {}
      end

      # @return the larger of the image's width and height
      def self.long_dim(image)
        [image[:width], image[:height]].max
      end

      # Work out the kdu_compress arguments.
      #
      # * Symbol :recipe -- looked up (suffixed with "_<quality>") in the
      #   configured recipes; falls back to a calculated recipe with a warning.
      # * String :recipe -- used verbatim.
      # * anything else  -- a recipe is calculated.
      def self.kdu_compress_recipe(args, quality, long_dim)
        case args[:recipe]
        when Symbol
          recipe = [args[:recipe].to_s, quality].join('_').to_sym
          recipes = Hydra::Derivatives.kdu_compress_recipes
          return recipes[recipe] if recipes.key?(recipe)

          ActiveFedora::Base.logger.warn "No JP2 recipe for :#{args[:recipe]} ('#{recipe}') found in configuration. Using best guess."
          calculate_recipe(args, quality, long_dim)
        when String
          args[:recipe]
        else
          calculate_recipe(args, quality, long_dim)
        end
      end

      # Build a recipe from the :levels, :layers, :compression and :tile_size
      # arguments, falling back to defaults derived from the image.
      def self.calculate_recipe(args, quality, long_dim)
        levels_arg = args.fetch(:levels, level_count_for_size(long_dim))
        rates_arg = layer_rates(args.fetch(:layers, 8), args.fetch(:compression, 10))
        tile_size = args.fetch(:tile_size, 1024)
        tiles_arg = "#{tile_size},#{tile_size}"
        jp2_space_arg = quality == 'gray' ? 'sLUM' : 'sRGB'

        # Runs of whitespace (including the newlines) are collapsed below.
        %Q{-rate #{rates_arg}
          -jp2_space #{jp2_space_arg}
          -double_buffering 10
          -num_threads 4
          -no_weights
          Clevels=#{levels_arg}
          "Stiles={#{tiles_arg}}"
          "Cblk={64,64}"
          Cuse_sop=yes
          Cuse_eph=yes
          Corder=RPCL
          ORGgen_plt=yes
          ORGtparts=R }.gsub(/\s+/, " ").strip
      end

      # Number of resolution levels for an image whose longest side is
      # long_dim pixels: one less than the number of halvings needed to get
      # below 96 pixels, never less than zero (images smaller than 96 pixels
      # previously produced -1).
      def self.level_count_for_size(long_dim)
        levels = 0
        level_size = long_dim
        while level_size >= 96
          level_size /= 2
          levels += 1
        end
        [levels - 1, 0].max
      end

      # Comma separated list of layer_count rates, starting at
      # 24.0 / compression_numerator and dividing by 1.618 for each
      # subsequent layer.
      # e.g. if compression_numerator = 10 then compression is 10:1
      def self.layer_rates(layer_count, compression_numerator)
        rates = []
        cmp = 24.0 / compression_numerator
        layer_count.times do
          rates << cmp
          cmp = (cmp / 1.618).round(8)
        end
        rates.map(&:to_s).join(',')
      end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Hydra::Derivatives::Processors
  # Abstract base class: a processor turns one source file into one output.
  class Processor
    attr_accessor :source_path, :directives, :output_file_service

    # @param [String] source_path path to the file on disk
    # @param [Hash] directives directions for creating the output
    # @option [String] :format the format of the output
    # @option [String] :url the location to put the output
    # @param [Hash] opts
    # @option [#call] :output_file_service An output file service to call;
    #   defaults to Hydra::Derivatives.output_file_service
    def initialize(source_path, directives, opts = {})
      default_service = Hydra::Derivatives.output_file_service
      self.source_path = source_path
      self.directives = directives
      self.output_file_service = opts.fetch(:output_file_service, default_service)
    end

    # Subclasses must provide the actual implementation.
    #
    # @raise [RuntimeError] always
    def process
      message = "Processor is an abstract class. Implement `process' on #{self.class.name}"
      raise message
    end

    # Builds the output key sent to the persist file service. The key is fine
    # for storing in Fedora, but it is not a great name for saving to the
    # file system.
    # NOTE(review): `out_prefix` is not defined in this class -- confirm that
    # it is supplied elsewhere.
    def output_file_id(name)
      parts = [out_prefix, name]
      parts.join('_')
    end

    # The output file name is the base name of the source; the argument is ignored.
    def output_filename_for(_name)
      File.basename(source_path)
    end

    # @deprecated Please use a PersistOutputFileService class to save an object
    def output_file
      message = "Processor is an abstract class. Utilize an implementation of a PersistOutputFileService class in #{self.class.name}"
      raise NotImplementedError, message
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'mini_magick'
|
2
|
+
|
3
|
+
module Hydra::Derivatives::Processors
  # Image processor for RAW files. Differs from Image in the order in which
  # the format is applied and in cleaning up the transformer's temp files.
  class RawImage < Image
    class_attribute :timeout

    protected

      # Create the derivative and always clean up the transformer afterwards.
      # Returns the result of write_image, like Image#create_image.
      def create_image(destination_name, format, quality = nil)
        xfrm = load_image_transformer
        begin
          # Transpose format and scaling due to the fact that ImageMagick can
          # read but not write RAW files and this will otherwise cause many
          # cryptic segmentation faults
          xfrm.format(format)
          yield(xfrm) if block_given?
          xfrm.quality(quality.to_s) if quality
          write_image(destination_name, format, xfrm)
        ensure
          # Clean up even when formatting, resizing or writing raised.
          remove_temp_files(xfrm)
        end
      end

      # Delete any temp files that might clutter up the disk if
      # you are doing a batch or don't touch your temporary storage
      # for a long time
      def remove_temp_files(xfrm)
        xfrm.destroy!
      end

      # Override this method if you want a different transformer, or
      # need to load the raw image from a different source (e.g.
      # external file).
      #
      # NOTE(review): the previous comment said an extension is always added
      # to help out MiniMagick with RAW files, but this implementation opens
      # source_path unchanged -- confirm which is intended.
      def load_image_transformer
        MiniMagick::Image.open(source_path)
      end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# An abstract class for asynchronous jobs that transcode files using FFMpeg
|
2
|
+
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'open3'
|
5
|
+
|
6
|
+
module Hydra::Derivatives::Processors
  # Mixin for processors that create their output by running a shell command.
  # The including class must provide a class-level `encode(path, options,
  # output_file)` method; it gains a `timeout` class attribute and the Open3
  # helpers.
  module ShellBasedProcessor
    extend ActiveSupport::Concern

    # Number of bytes read from the command's stderr per read.
    BLOCK_SIZE = 1024

    included do
      class_attribute :timeout
      extend Open3
    end

    # Transcode the source into the format given by the :format directive.
    #
    # @raise [KeyError] when the :label directive is missing
    # @raise [ArgumentError] when the :format directive is missing
    def process
      directives.fetch(:label) # value unused; kept so a missing :label still raises
      format = directives[:format]
      raise ArgumentError, "You must provide the :format you want to transcode into. You provided #{directives}" unless format
      # TODO if the source is in the correct format, we could just copy it and skip transcoding.
      encode_file(format, options_for(format))
    end

    # override this method in subclass if you want to provide specific options.
    # returns a hash of options that the specific processors use
    def options_for(format)
      {}
    end

    # Encode the source into a temporary file, pass that file to the output
    # file service, and remove the temporary file afterwards.
    #
    # @param file_suffix extension (without the dot) for the temporary file
    # @param options passed through to the class-level encode method
    def encode_file(file_suffix, options)
      temp_file_name = output_file(file_suffix)
      self.class.encode(source_path, options, temp_file_name)
      # The block form closes the handle once the service has consumed it.
      File.open(temp_file_name, 'rb') { |io| output_file_service.call(io, directives) }
    ensure
      # output_file only reserves a name, so the file may not exist if encoding failed.
      File.unlink(temp_file_name) if temp_file_name && File.exist?(temp_file_name)
    end

    # Generate a temporary file name with the given suffix under
    # Hydra::Derivatives.temp_file_base. No file is created.
    def output_file(file_suffix)
      Dir::Tmpname.create(['sufia', ".#{file_suffix}"], Hydra::Derivatives.temp_file_base) {}
    end

    module ClassMethods
      # Run a shell command, honouring the class-level timeout when one is set.
      #
      # @param [String] command the command line to run
      def execute(command)
        context = {}
        if timeout
          execute_with_timeout(timeout, command, context)
        else
          execute_without_timeout(command, context)
        end
      end

      # Run the command and kill it when it exceeds the timeout.
      #
      # @param timeout seconds the command is allowed to run
      # @param [String] command the command line to run
      # @param [Hash] context receives the child's :pid once it was spawned
      # @raise [Hydra::Derivatives::TimeoutError] when the timeout is exceeded
      def execute_with_timeout(timeout, command, context)
        Timeout.timeout(timeout) { execute_without_timeout(command, context) }
      rescue Timeout::Error
        pid = context[:pid]
        begin
          # pid is nil when the timeout fired before the child was spawned.
          Process.kill("KILL", pid) if pid
        rescue Errno::ESRCH
          # The child exited between the timeout firing and the kill.
        end
        raise Hydra::Derivatives::TimeoutError, "Unable to execute command \"#{command}\"\nThe command took longer than #{timeout} seconds to execute"
      end

      # Run the command, collect everything it writes to stderr and wait for
      # it to finish. The child's stdin and stdout are closed immediately.
      #
      # @param [String] command the command line to run
      # @param [Hash] context receives the child's :pid
      # @raise [RuntimeError] when the command exits unsuccessfully; the
      #   message includes the collected stderr output
      def execute_without_timeout(command, context)
        err_str = ''
        stdin, stdout, stderr, wait_thr = popen3(command)
        context[:pid] = wait_thr[:pid]
        stdin.close
        stdout.close
        files = [stderr]

        until all_eof?(files)
          ready = IO.select(files, nil, nil, 60)
          next unless ready

          # stderr is the only stream being watched.
          ready[0].each do |f|
            begin
              err_str << f.read_nonblock(BLOCK_SIZE)
            rescue EOFError
              # No big deal.
              ActiveFedora::Base.logger.warn "Caught an eof error in ShellBasedProcessor"
            end
          end
        end
        exit_status = wait_thr.value

        raise "Unable to execute command \"#{command}\". Exit code: #{exit_status}\nError message: #{err_str}" unless exit_status.success?
      ensure
        # popen3 without a block leaves closing the pipes to the caller.
        stderr.close if stderr && !stderr.closed?
      end

      # @return [Boolean] true when every given stream has reached end of file
      def all_eof?(files)
        files.all?(&:eof)
      end
    end
  end
end
|