hydra-derivatives 2.0.0 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -1
  3. data/History.md +43 -0
  4. data/README.md +23 -38
  5. data/VERSION +1 -1
  6. data/hydra-derivatives.gemspec +0 -1
  7. data/lib/hydra/derivatives.rb +23 -123
  8. data/lib/hydra/derivatives/io_decorator.rb +7 -1
  9. data/lib/hydra/derivatives/processors.rb +19 -0
  10. data/lib/hydra/derivatives/processors/audio.rb +6 -0
  11. data/lib/hydra/derivatives/processors/document.rb +28 -0
  12. data/lib/hydra/derivatives/processors/ffmpeg.rb +22 -0
  13. data/lib/hydra/derivatives/processors/full_text.rb +60 -0
  14. data/lib/hydra/derivatives/processors/image.rb +58 -0
  15. data/lib/hydra/derivatives/processors/jpeg2k_image.rb +129 -0
  16. data/lib/hydra/derivatives/processors/processor.rb +38 -0
  17. data/lib/hydra/derivatives/processors/raw_image.rb +37 -0
  18. data/lib/hydra/derivatives/processors/shell_based_processor.rb +108 -0
  19. data/lib/hydra/derivatives/{video.rb → processors/video.rb} +1 -1
  20. data/lib/hydra/derivatives/{video → processors/video}/config.rb +1 -1
  21. data/lib/hydra/derivatives/{video → processors/video}/processor.rb +2 -8
  22. data/lib/hydra/derivatives/runners/audio_derivatives.rb +7 -0
  23. data/lib/hydra/derivatives/runners/document_derivatives.rb +7 -0
  24. data/lib/hydra/derivatives/runners/full_text_extract.rb +16 -0
  25. data/lib/hydra/derivatives/runners/image_derivatives.rb +16 -0
  26. data/lib/hydra/derivatives/runners/jpeg2k_image_derivatives.rb +15 -0
  27. data/lib/hydra/derivatives/runners/pdf_derivatives.rb +6 -0
  28. data/lib/hydra/derivatives/runners/runner.rb +52 -0
  29. data/lib/hydra/derivatives/runners/video_derivatives.rb +7 -0
  30. data/lib/hydra/derivatives/services/mime_type_service.rb +10 -0
  31. data/lib/hydra/derivatives/services/persist_basic_contained_output_file_service.rb +23 -8
  32. data/lib/hydra/derivatives/services/persist_output_file_service.rb +4 -5
  33. data/lib/hydra/derivatives/services/retrieve_source_file_service.rb +8 -6
  34. data/spec/processors/full_text.rb +61 -0
  35. data/spec/{units → processors}/image_spec.rb +7 -17
  36. data/spec/{units → processors}/jpeg2k_spec.rb +9 -11
  37. data/spec/processors/processor_spec.rb +36 -0
  38. data/spec/processors/shell_based_processor_spec.rb +19 -0
  39. data/spec/processors/video_spec.rb +40 -0
  40. data/spec/services/audio_derivatives_spec.rb +76 -0
  41. data/spec/services/persist_basic_contained_output_file_service_spec.rb +4 -3
  42. data/spec/services/retrieve_source_file_service_spec.rb +16 -12
  43. data/spec/units/derivatives_spec.rb +18 -26
  44. data/spec/units/io_decorator_spec.rb +33 -0
  45. data/spec/units/transcoding_spec.rb +109 -86
  46. metadata +42 -44
  47. data/lib/hydra/derivatives/audio.rb +0 -19
  48. data/lib/hydra/derivatives/document.rb +0 -56
  49. data/lib/hydra/derivatives/extract_metadata.rb +0 -27
  50. data/lib/hydra/derivatives/ffmpeg.rb +0 -31
  51. data/lib/hydra/derivatives/image.rb +0 -73
  52. data/lib/hydra/derivatives/jpeg2k_image.rb +0 -136
  53. data/lib/hydra/derivatives/processor.rb +0 -33
  54. data/lib/hydra/derivatives/railtie.rb +0 -9
  55. data/lib/hydra/derivatives/raw_image.rb +0 -45
  56. data/lib/hydra/derivatives/shell_based_processor.rb +0 -81
  57. data/spec/lib/hydra/derivatives/extract_metadata_spec.rb +0 -39
  58. data/spec/units/extract_spec.rb +0 -22
  59. data/spec/units/processor_spec.rb +0 -61
  60. data/spec/units/shell_based_processor_spec.rb +0 -22
  61. data/spec/units/video_spec.rb +0 -50
@@ -0,0 +1,60 @@
1
module Hydra::Derivatives::Processors
  # Extracts the full text of the source file by posting its bytes to Solr's
  # extract handler, then passes the extracted text to the output file service.
  class FullText < Processor
    # Run the full text extraction and save the result.
    # @return the value returned by the configured output file service
    def process
      output_file_service.call(extract, directives)
    end

    private

      # Pulls the extracted text out of the JSON response. The text is read
      # from the empty-string key of the response and trailing whitespace is
      # stripped.
      #
      # @return [String] the extracted text
      def extract
        JSON.parse(fetch)[''].rstrip
      end

      # Send the file to the extract service and return the response body.
      # TODO: this pulls the whole file into memory. We should stream it from Fedora instead
      #
      # @return [String] the body returned by the extract service
      # @raise [RuntimeError] when the service answers with anything but 200
      def fetch
        http = Net::HTTP.new(uri.host, uri.port)
        # Without this an https Solr URL would be contacted in plain text and fail.
        http.use_ssl = true if uri.scheme == 'https'
        resp = http.post(uri.to_s, file_content, request_headers)
        raise "Solr Extract service was unsuccessful. '#{uri}' returned code #{resp.code} for #{source_path}\n#{resp.body}" unless resp.code == '200'

        resp.body
      end

      # The raw bytes of the source file, read once and memoized.
      # Read in binary mode so the byte count matches the Content-Length
      # header (which is taken from File.size) and no file handle is left open.
      #
      # @return [String] the file's bytes
      def file_content
        @content ||= File.binread(source_path)
      end

      # @return [Hash] the request headers to send to the Solr extract service
      def request_headers
        { Faraday::Request::UrlEncoded::CONTENT_TYPE => mime_type.to_s,
          Faraday::Adapter::CONTENT_LENGTH => original_size.to_s }
      end

      # @return the mime type reported by MimeTypeService for the source file
      def mime_type
        Hydra::Derivatives::MimeTypeService.mime_type(source_path)
      end

      # @return [Integer] size of the source file in bytes
      def original_size
        File.size(source_path)
      end

      # @return [URI] location of the extract service
      def uri
        @uri ||= URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
      end

      # @return [String] base URL of Solr, taken from the ActiveFedora Solr configuration
      def connection_url
        ActiveFedora.solr_config[:url]
      end
  end
end
@@ -0,0 +1,58 @@
1
+ require 'mini_magick'
2
+
3
module Hydra::Derivatives::Processors
  # Produces an image derivative of the source file with MiniMagick and hands
  # the result to the output file service. An optional class-level `timeout`
  # (in seconds) bounds the whole conversion.
  class Image < Processor
    class_attribute :timeout

    # Run the conversion, wrapped in a timeout only when one is configured.
    def process
      return process_without_timeout unless timeout

      process_with_timeout
    end

    # @raise [Hydra::Derivatives::TimeoutError] when the conversion exceeds `timeout` seconds
    def process_with_timeout
      Timeout.timeout(timeout) { process_without_timeout }
    rescue Timeout::Error
      raise Hydra::Derivatives::TimeoutError, "Unable to process image derivative\nThe command took longer than #{timeout} seconds to execute"
    end

    # Reads :format (required), :label (defaults to the format), :size and
    # :quality (both optional) from the directives and creates the image.
    def process_without_timeout
      target_format = directives.fetch(:format)
      label = directives.fetch(:label, target_format)
      create_resized_image(
        output_filename_for(label),
        directives.fetch(:size, nil),
        target_format,
        directives.fetch(:quality, nil)
      )
    end

    protected

      # Creates the image, resizing it first when a size was supplied.
      def create_resized_image(destination_name, size, format, quality = nil)
        create_image(destination_name, format, quality) do |transformer|
          transformer.resize(size) if size.present?
        end
      end

      # Loads the transformer, lets the caller's block adjust it, then applies
      # the output format and optional quality before writing.
      def create_image(destination_name, format, quality = nil)
        transformer = load_image_transformer
        yield transformer if block_given?
        transformer.format(format)
        transformer.quality(quality.to_s) if quality
        write_image(destination_name, format, transformer)
      end

      # Renders the transformed image into an in-memory buffer and passes the
      # rewound buffer to the output file service along with the directives.
      def write_image(destination_name, format, xfrm)
        buffer = StringIO.new
        xfrm.write(buffer)
        buffer.rewind

        output_file_service.call(buffer, directives)
      end

      # Override this method if you want a different transformer, or need to load the
      # raw image from a different source (e.g. external file)
      def load_image_transformer
        MiniMagick::Image.open(source_path)
      end
  end
end
@@ -0,0 +1,129 @@
1
+ require 'mini_magick'
2
+ require 'nokogiri'
3
+
4
+
5
module Hydra::Derivatives::Processors
  # Creates a JPEG 2000 derivative. The source image is opened with
  # MiniMagick, optionally resized and converted to sRGB, written to a
  # temporary TIFF and then encoded with the kdu_compress command-line tool.
  class Jpeg2kImage < Processor
    include ShellBasedProcessor

    # Build the derivative and hand it to the output file service.
    # The intermediate TIFF is removed even when encoding fails.
    #
    # @raise [KeyError] when the directives carry no :label
    def process
      image = MiniMagick::Image.open(source_path)
      quality = image['%[channels]'] == 'gray' ? 'gray' : 'color'
      # The label itself is not used here, but a missing :label must still raise.
      directives.fetch(:label)
      long_dim = self.class.long_dim(image)
      file_path = self.class.tmp_file('.tif')
      to_srgb = directives.fetch(:to_srgb, true)
      if directives[:resize] || to_srgb
        preprocess(image, resize: directives[:resize], to_srgb: to_srgb, src_quality: quality)
      end
      image.write file_path
      recipe = self.class.kdu_compress_recipe(directives, quality, long_dim)
      encode_file(recipe, file_path: file_path)
    ensure
      # tmp_file only reserves a name, so the file may not exist if we failed early.
      File.unlink(file_path) if file_path && File.exist?(file_path)
    end

    # Encode a file to JP2 and pass the result to the output file service.
    #
    # @param [String] recipe the kdu_compress arguments to use
    # @param [Hash] opts
    # @option opts [String] :file_path file to encode; defaults to the
    #   processor's source_path when omitted
    def encode_file(recipe, opts = {})
      output_file = self.class.tmp_file('.jp2')
      input_path = opts[:file_path] || source_path
      self.class.encode(input_path, recipe, output_file)
      # Block form closes the handle once the service has consumed it.
      File.open(output_file, 'rb') { |io| output_file_service.call(io, directives) }
    ensure
      File.unlink(output_file) if output_file && File.exist?(output_file)
    end

    protected

      # Applies the optional resize and sRGB profile conversion in place.
      #
      # @param image the MiniMagick image to modify
      # @param [Hash] opts resize: <geometry>, to_srgb: <bool>, src_quality: 'color'|'gray'
      # @return the same image
      def preprocess(image, opts = {})
        image.combine_options do |c|
          c.resize(opts[:resize]) if opts[:resize]
          c.profile self.class.srgb_profile_path if opts[:src_quality] == 'color' && opts[:to_srgb]
        end
        image
      end

      # NOTE: `protected` has no effect on the `def self.` methods below;
      # they remain public class methods.

      # Runs kdu_compress on +path+, writing the JP2 to +output_file+.
      def self.encode(path, recipe, output_file)
        kdu_compress = Hydra::Derivatives.kdu_compress_path
        execute "#{kdu_compress} -i #{path} -o #{output_file} #{recipe}"
      end

      # @return [String] path of the sRGB colour profile, resolved relative to this file
      def self.srgb_profile_path
        File.join [
          File.expand_path('../../../', __FILE__),
          'color_profiles',
          'sRGB_IEC61966-2-1_no_black_scaling.icc'
        ]
      end

      # Reserves a unique temp file name (the file itself is not created).
      #
      # @param [String] ext extension, including the leading dot
      # @return [String] the generated path
      def self.tmp_file(ext)
        Dir::Tmpname.create(['sufia', ext], Hydra::Derivatives.temp_file_base) {}
      end

      # @return the larger of the image's width and height
      def self.long_dim(image)
        [image[:width], image[:height]].max
      end

      # Chooses the kdu_compress arguments.
      # A Symbol :recipe is looked up (suffixed with the quality) in the
      # configured recipes, falling back to a calculated recipe with a
      # warning; a String :recipe is used verbatim; anything else is calculated.
      #
      # @return [String] kdu_compress arguments
      def self.kdu_compress_recipe(args, quality, long_dim)
        case args[:recipe]
        when Symbol
          recipe = [args[:recipe].to_s, quality].join('_').to_sym
          if Hydra::Derivatives.kdu_compress_recipes.key?(recipe)
            Hydra::Derivatives.kdu_compress_recipes[recipe]
          else
            ActiveFedora::Base.logger.warn "No JP2 recipe for :#{args[:recipe]} ('#{recipe}') found in configuration. Using best guess."
            calculate_recipe(args, quality, long_dim)
          end
        when String
          args[:recipe]
        else
          calculate_recipe(args, quality, long_dim)
        end
      end

      # Builds a recipe from :levels, :layers (default 8), :compression
      # (default 10) and :tile_size (default 1024).
      #
      # @return [String] single-line kdu_compress arguments
      def self.calculate_recipe(args, quality, long_dim)
        levels_arg = args.fetch(:levels, level_count_for_size(long_dim))
        rates_arg = layer_rates(args.fetch(:layers, 8), args.fetch(:compression, 10))
        tile_size = args.fetch(:tile_size, 1024)
        tiles_arg = "#{tile_size},#{tile_size}"
        jp2_space_arg = quality == 'gray' ? 'sLUM' : 'sRGB'

        %Q{-rate #{rates_arg}
            -jp2_space #{jp2_space_arg}
            -double_buffering 10
            -num_threads 4
            -no_weights
            Clevels=#{levels_arg}
            "Stiles={#{tiles_arg}}"
            "Cblk={64,64}"
            Cuse_sop=yes
            Cuse_eph=yes
            Corder=RPCL
            ORGgen_plt=yes
            ORGtparts=R }.gsub(/\s+/, " ").strip
      end

      # Counts how many times the long side can be halved while it is still
      # at least 96, minus one. Clamped at zero so that images smaller than
      # 96 on their long side do not yield a negative Clevels value.
      #
      # @return [Integer] number of levels, never negative
      def self.level_count_for_size(long_dim)
        levels = 0
        level_size = long_dim
        while level_size >= 96
          level_size /= 2
          levels += 1
        end
        [levels - 1, 0].max
      end

      # Produces +layer_count+ comma-separated rates, starting at
      # 24.0 / compression_numerator and dividing by 1.618 for each further layer.
      # e.g. if compression_numerator = 10 then compression is 10:1
      #
      # @return [String] the comma-separated rates
      def self.layer_rates(layer_count, compression_numerator)
        rates = []
        cmp = 24.0 / compression_numerator
        layer_count.times do
          rates << cmp
          cmp = (cmp / 1.618).round(8)
        end
        rates.map(&:to_s).join(',')
      end
  end
end
@@ -0,0 +1,38 @@
1
module Hydra::Derivatives::Processors
  # Base class for derivative processors. A processor takes a single input
  # and produces a single output; subclasses implement `process`.
  class Processor
    attr_accessor :source_path, :directives, :output_file_service

    # @param [String] source_path path to the file on disk
    # @param [Hash] directives directions for creating the output
    # @option [String] :format the format of the output
    # @option [String] :url the location to put the output
    # @param [Hash] opts
    # @option [#call] :output_file_service the service that persists the
    #   output; defaults to Hydra::Derivatives.output_file_service
    def initialize(source_path, directives, opts = {})
      self.source_path = source_path
      self.directives = directives
      self.output_file_service = opts.fetch(:output_file_service, Hydra::Derivatives.output_file_service)
    end

    # Subclasses must override this.
    # @raise [RuntimeError] always, in this base implementation
    def process
      raise RuntimeError, "Processor is an abstract class. Implement `process' on #{self.class.name}"
    end

    # Builds the output key sent to the persist file service by joining the
    # prefix and the given name with an underscore. Adequate for storing in
    # Fedora, but not a great name for saving to the file system.
    # NOTE(review): `out_prefix` is not defined in this class; presumably it
    # is supplied elsewhere (subclass or mixin) -- confirm.
    def output_file_id(name)
      [out_prefix, name].join('_')
    end

    # The argument is ignored; the result is always the base name of the source file.
    # @return [String]
    def output_filename_for(_name)
      File.basename(source_path)
    end

    # @deprecated Please use a PersistOutputFileService class to save an object
    # @raise [NotImplementedError] always
    def output_file
      raise NotImplementedError, "Processor is an abstract class. Utilize an implementation of a PersistOutputFileService class in #{self.class.name}"
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'mini_magick'
2
+
3
module Hydra::Derivatives::Processors
  # Image processor variant for RAW source files. It differs from Image in
  # the order of operations (the output format is set before any resizing)
  # and in cleaning up the transformer's temp files afterwards.
  class RawImage < Image
    class_attribute :timeout

    protected

      # Creates the derivative and always cleans up the transformer's temp
      # files, including when the conversion or the write raises.
      #
      # @return the value returned by write_image
      def create_image(destination_name, format, quality = nil)
        xfrm = load_image_transformer
        begin
          # Transpose format and scaling due to the fact that ImageMagick can
          # read but not write RAW files and this will otherwise cause many
          # cryptic segmentation faults
          xfrm.format(format)
          yield(xfrm) if block_given?
          xfrm.quality(quality.to_s) if quality
          write_image(destination_name, format, xfrm)
        ensure
          remove_temp_files(xfrm)
        end
      end

      # Delete any temp files that might clutter up the disk if
      # you are doing a batch or don't touch your temporary storage
      # for a long time
      def remove_temp_files(xfrm)
        xfrm.destroy!
      end

      # Override this method if you want a different transformer, or need to
      # load the raw image from a different source (e.g. external file).
      #
      # As written this simply opens source_path with MiniMagick; no file
      # extension is added here.
      def load_image_transformer
        MiniMagick::Image.open(source_path)
      end
  end
end
@@ -0,0 +1,108 @@
1
+ # An abstract class for asynchronous jobs that transcode files using FFMpeg
2
+
3
+ require 'tmpdir'
4
+ require 'open3'
5
+
6
module Hydra::Derivatives::Processors
  # Mixin for processors that build a derivative by running an external
  # command. The including class must provide a class-level
  # `encode(source_path, options, output_path)`; this module supplies the
  # temp-file handling and `execute`, which runs a command with an optional
  # class-level timeout.
  module ShellBasedProcessor
    extend ActiveSupport::Concern

    # Number of bytes requested per non-blocking read of the command's stderr.
    BLOCK_SIZE = 1024

    included do
      class_attribute :timeout
      extend Open3
    end

    # Transcode the source into the format named by the directives.
    #
    # @raise [KeyError] when the directives carry no :label
    # @raise [ArgumentError] when the directives carry no :format
    def process
      # The label itself is not used here, but a missing :label must still raise.
      directives.fetch(:label)
      format = directives[:format]
      raise ArgumentError, "You must provide the :format you want to transcode into. You provided #{directives}" unless format
      # TODO if the source is in the correct format, we could just copy it and skip transcoding.
      encode_file(format, options_for(format))
    end

    # override this method in subclass if you want to provide specific options.
    # returns a hash of options that the specific processors use
    def options_for(format)
      {}
    end

    # Encodes the source into a temp file, passes that file to the output
    # file service, and removes the temp file even when a step fails.
    #
    # @param [String] file_suffix extension (without dot) for the temp file
    # @param options passed through to the class-level `encode`
    def encode_file(file_suffix, options)
      temp_file_name = output_file(file_suffix)
      self.class.encode(source_path, options, temp_file_name)
      # Block form closes the handle once the service has consumed it.
      File.open(temp_file_name, 'rb') { |io| output_file_service.call(io, directives) }
    ensure
      # output_file only reserves a name, so the file may not exist if encoding failed.
      File.unlink(temp_file_name) if temp_file_name && File.exist?(temp_file_name)
    end

    # Reserves a unique temp file name (the file itself is not created).
    #
    # @return [String] the generated path
    def output_file(file_suffix)
      Dir::Tmpname.create(['sufia', ".#{file_suffix}"], Hydra::Derivatives.temp_file_base) {}
    end

    module ClassMethods
      # Run a shell command, honouring the class-level timeout when set.
      #
      # @param [String] command the command line to run
      # @raise [Hydra::Derivatives::TimeoutError] when the timeout is exceeded
      # @raise [RuntimeError] when the command exits unsuccessfully
      def execute(command)
        context = {}
        if timeout
          execute_with_timeout(timeout, command, context)
        else
          execute_without_timeout(command, context)
        end
      end

      # Runs the command and kills it if it outlives +timeout+ seconds.
      # +context+ is filled with the child's :pid once it has been spawned.
      def execute_with_timeout(timeout, command, context)
        Timeout.timeout(timeout) do
          execute_without_timeout(command, context)
        end
      rescue Timeout::Error
        pid = context[:pid]
        begin
          # pid is nil if the timeout fired before the child was spawned.
          Process.kill("KILL", pid) if pid
        rescue Errno::ESRCH
          # The child has already exited; nothing left to kill.
        end
        raise Hydra::Derivatives::TimeoutError, "Unable to execute command \"#{command}\"\nThe command took longer than #{timeout} seconds to execute"
      end

      # Runs the command, collecting its stderr for the failure message.
      # stdin and stdout of the child are closed immediately.
      #
      # @raise [RuntimeError] when the command exits unsuccessfully
      def execute_without_timeout(command, context)
        err_str = ''
        stdin, stdout, stderr, wait_thr = popen3(command)
        context[:pid] = wait_thr[:pid]
        stdin.close
        stdout.close
        files = [stderr]

        until all_eof?(files)
          ready = IO.select(files, nil, nil, 60)
          next unless ready

          ready[0].each do |f|
            begin
              # stderr is the only stream being watched.
              err_str << f.read_nonblock(BLOCK_SIZE)
            rescue EOFError
              # No big deal.
              logger = ActiveFedora::Base.logger
              logger.debug "Caught an eof error in ShellBasedProcessor" if logger
            end
          end
        end
        exit_status = wait_thr.value

        raise "Unable to execute command \"#{command}\". Exit code: #{exit_status}\nError message: #{err_str}" unless exit_status.success?
      ensure
        # Release the pipe on every path, including timeouts and failures.
        stderr.close if stderr && !stderr.closed?
      end

      # @return [Boolean] true when every given IO has reached end of file
      def all_eof?(files)
        files.all?(&:eof)
      end
    end
  end
end
@@ -1,4 +1,4 @@
1
- module Hydra::Derivatives
1
+ module Hydra::Derivatives::Processors
2
2
  module Video
3
3
  extend ActiveSupport::Autoload
4
4
 
@@ -1,4 +1,4 @@
1
- module Hydra::Derivatives::Video
1
+ module Hydra::Derivatives::Processors::Video
2
2
  class Config
3
3
  attr_writer :video_bitrate, :video_attributes, :size_attributes, :audio_attributes
4
4