hydra-derivatives 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -1
  3. data/History.md +43 -0
  4. data/README.md +23 -38
  5. data/VERSION +1 -1
  6. data/hydra-derivatives.gemspec +0 -1
  7. data/lib/hydra/derivatives.rb +23 -123
  8. data/lib/hydra/derivatives/io_decorator.rb +7 -1
  9. data/lib/hydra/derivatives/processors.rb +19 -0
  10. data/lib/hydra/derivatives/processors/audio.rb +6 -0
  11. data/lib/hydra/derivatives/processors/document.rb +28 -0
  12. data/lib/hydra/derivatives/processors/ffmpeg.rb +22 -0
  13. data/lib/hydra/derivatives/processors/full_text.rb +60 -0
  14. data/lib/hydra/derivatives/processors/image.rb +58 -0
  15. data/lib/hydra/derivatives/processors/jpeg2k_image.rb +129 -0
  16. data/lib/hydra/derivatives/processors/processor.rb +38 -0
  17. data/lib/hydra/derivatives/processors/raw_image.rb +37 -0
  18. data/lib/hydra/derivatives/processors/shell_based_processor.rb +108 -0
  19. data/lib/hydra/derivatives/{video.rb → processors/video.rb} +1 -1
  20. data/lib/hydra/derivatives/{video → processors/video}/config.rb +1 -1
  21. data/lib/hydra/derivatives/{video → processors/video}/processor.rb +2 -8
  22. data/lib/hydra/derivatives/runners/audio_derivatives.rb +7 -0
  23. data/lib/hydra/derivatives/runners/document_derivatives.rb +7 -0
  24. data/lib/hydra/derivatives/runners/full_text_extract.rb +16 -0
  25. data/lib/hydra/derivatives/runners/image_derivatives.rb +16 -0
  26. data/lib/hydra/derivatives/runners/jpeg2k_image_derivatives.rb +15 -0
  27. data/lib/hydra/derivatives/runners/pdf_derivatives.rb +6 -0
  28. data/lib/hydra/derivatives/runners/runner.rb +52 -0
  29. data/lib/hydra/derivatives/runners/video_derivatives.rb +7 -0
  30. data/lib/hydra/derivatives/services/mime_type_service.rb +10 -0
  31. data/lib/hydra/derivatives/services/persist_basic_contained_output_file_service.rb +23 -8
  32. data/lib/hydra/derivatives/services/persist_output_file_service.rb +4 -5
  33. data/lib/hydra/derivatives/services/retrieve_source_file_service.rb +8 -6
  34. data/spec/processors/full_text.rb +61 -0
  35. data/spec/{units → processors}/image_spec.rb +7 -17
  36. data/spec/{units → processors}/jpeg2k_spec.rb +9 -11
  37. data/spec/processors/processor_spec.rb +36 -0
  38. data/spec/processors/shell_based_processor_spec.rb +19 -0
  39. data/spec/processors/video_spec.rb +40 -0
  40. data/spec/services/audio_derivatives_spec.rb +76 -0
  41. data/spec/services/persist_basic_contained_output_file_service_spec.rb +4 -3
  42. data/spec/services/retrieve_source_file_service_spec.rb +16 -12
  43. data/spec/units/derivatives_spec.rb +18 -26
  44. data/spec/units/io_decorator_spec.rb +33 -0
  45. data/spec/units/transcoding_spec.rb +109 -86
  46. metadata +42 -44
  47. data/lib/hydra/derivatives/audio.rb +0 -19
  48. data/lib/hydra/derivatives/document.rb +0 -56
  49. data/lib/hydra/derivatives/extract_metadata.rb +0 -27
  50. data/lib/hydra/derivatives/ffmpeg.rb +0 -31
  51. data/lib/hydra/derivatives/image.rb +0 -73
  52. data/lib/hydra/derivatives/jpeg2k_image.rb +0 -136
  53. data/lib/hydra/derivatives/processor.rb +0 -33
  54. data/lib/hydra/derivatives/railtie.rb +0 -9
  55. data/lib/hydra/derivatives/raw_image.rb +0 -45
  56. data/lib/hydra/derivatives/shell_based_processor.rb +0 -81
  57. data/spec/lib/hydra/derivatives/extract_metadata_spec.rb +0 -39
  58. data/spec/units/extract_spec.rb +0 -22
  59. data/spec/units/processor_spec.rb +0 -61
  60. data/spec/units/shell_based_processor_spec.rb +0 -22
  61. data/spec/units/video_spec.rb +0 -50
@@ -0,0 +1,60 @@
1
module Hydra::Derivatives::Processors
  # Extract the full text from the content using Solr's extract handler and
  # pass the extracted text to the output file service.
  class FullText < Processor
    # Run the full text extraction and save the result.
    # @return the value returned by the output file service
    def process
      output_file_service.call(extract, directives)
    end

    private

    # Extract full text from the content using Solr's extract handler.
    # The text is read from the '' (empty string) key of the JSON response.
    #
    # @return [String] the extracted text with trailing whitespace removed
    def extract
      JSON.parse(fetch)[''].rstrip
    end

    # Send the request to the extract service and return the response body
    # if it was successful.
    # TODO: this pulls the whole file into memory. We should stream it from Fedora instead
    #
    # @return [String] the result of calling the extract service
    # @raise [RuntimeError] when the service answers with anything other than 200
    def fetch
      req = Net::HTTP.new(uri.host, uri.port)
      # Net::HTTP only negotiates TLS when asked to; without this an https
      # Solr URL would be contacted in plain text and fail.
      req.use_ssl = uri.scheme == 'https'
      resp = req.post(uri.to_s, file_content, request_headers)
      raise "Solr Extract service was unsuccessful. '#{uri}' returned code #{resp.code} for #{source_path}\n#{resp.body}" unless resp.code == '200'
      # No-op for the String returned by #file_content here; kept for
      # implementations that substitute an IO-like object.
      file_content.rewind if file_content.respond_to?(:rewind)

      resp.body
    end

    # Read the source file once and memoize it. File.binread closes the file
    # handle and returns the raw bytes, so the body length always agrees with
    # the Content-Length computed from File.size.
    #
    # @return [String] the bytes of the source file
    def file_content
      @content ||= File.binread(source_path)
    end

    # @return [Hash] the request headers to send to the Solr extract service
    def request_headers
      { Faraday::Request::UrlEncoded::CONTENT_TYPE => mime_type.to_s,
        Faraday::Adapter::CONTENT_LENGTH => original_size.to_s }
    end

    # @return the mime type reported by MimeTypeService for the source file
    def mime_type
      Hydra::Derivatives::MimeTypeService.mime_type(source_path)
    end

    # @return [Integer] size of the source file in bytes
    def original_size
      File.size(source_path)
    end

    # @return [URI] path to the extract service
    def uri
      @uri ||= URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
    end

    # @return the :url entry of the ActiveFedora Solr configuration
    def connection_url
      ActiveFedora.solr_config[:url]
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require 'mini_magick'
2
+
3
module Hydra::Derivatives::Processors
  # Produces an image derivative of the source file with MiniMagick and hands
  # the result to the output file service.
  class Image < Processor
    class_attribute :timeout

    # Create the derivative, bounded by the class-level timeout when one is set.
    def process
      return process_without_timeout unless timeout

      process_with_timeout
    end

    # Run the conversion, giving up after `timeout` seconds.
    # @raise [Hydra::Derivatives::TimeoutError] when the conversion takes too long
    def process_with_timeout
      Timeout.timeout(timeout) { process_without_timeout }
    rescue Timeout::Error
      raise Hydra::Derivatives::TimeoutError, "Unable to process image derivative\nThe command took longer than #{timeout} seconds to execute"
    end

    # Read the directives (:format is mandatory; :label defaults to the
    # format; :size and :quality are optional) and create the derivative.
    def process_without_timeout
      output_format = directives.fetch(:format)
      label = directives.fetch(:label, output_format)
      create_resized_image(
        output_filename_for(label),
        directives.fetch(:size, nil),
        output_format,
        directives.fetch(:quality, nil)
      )
    end

    protected

    # Create the image, resizing it first when a size was given.
    def create_resized_image(destination_name, size, format, quality = nil)
      create_image(destination_name, format, quality) do |transformer|
        next unless size.present?

        transformer.resize(size)
      end
    end

    # Load the transformer, let the caller's block adjust it, then apply the
    # output format and optional quality before writing.
    def create_image(destination_name, format, quality = nil)
      transformer = load_image_transformer
      yield transformer if block_given?
      transformer.format(format)
      transformer.quality(quality.to_s) if quality
      write_image(destination_name, format, transformer)
    end

    # Write the transformed image into an in-memory buffer and pass that
    # buffer, rewound to the start, to the output file service.
    def write_image(destination_name, format, xfrm)
      buffer = StringIO.new
      xfrm.write(buffer)
      buffer.rewind

      output_file_service.call(buffer, directives)
    end

    # Override this method if you want a different transformer, or need to load the
    # raw image from a different source (e.g. external file)
    def load_image_transformer
      MiniMagick::Image.open(source_path)
    end
  end
end
@@ -0,0 +1,129 @@
1
+ require 'mini_magick'
2
+ require 'nokogiri'
3
+
4
+
5
require 'shellwords'

module Hydra::Derivatives::Processors
  # Creates a JPEG 2000 derivative: the source image is written to a temporary
  # TIFF (optionally resized / converted to sRGB) and then compressed with
  # the kdu_compress command line tool.
  class Jpeg2kImage < Processor
    include ShellBasedProcessor

    # Build the intermediate TIFF, encode it and persist the result.
    # The temporary TIFF is removed even when a step raises.
    #
    # @raise [KeyError] when the directives carry no :label
    # @return the value returned by #encode_file
    def process
      image = MiniMagick::Image.open(source_path)
      quality = image['%[channels]'] == 'gray' ? 'gray' : 'color'
      directives.fetch(:label) # value unused, but a missing :label must still raise
      long_dim = self.class.long_dim(image)
      file_path = self.class.tmp_file('.tif')
      to_srgb = directives.fetch(:to_srgb, true)
      begin
        if directives[:resize] || to_srgb
          preprocess(image, resize: directives[:resize], to_srgb: to_srgb, src_quality: quality)
        end
        image.write file_path
        recipe = self.class.kdu_compress_recipe(directives, quality, long_dim)
        encode_file(recipe, file_path: file_path)
      ensure
        # tmp_file only reserves a name, so the file may not exist yet
        File.unlink(file_path) if File.exist?(file_path)
      end
    end

    # Encode a file to JP2 and hand the result to the output file service.
    #
    # @param [String] recipe arguments passed to kdu_compress
    # @param [Hash] opts
    # @option opts [String] :file_path file to encode; defaults to source_path
    # @return the value returned by the output file service
    def encode_file(recipe, opts = {})
      output_file = self.class.tmp_file('.jp2')
      # Fall back to the processor's own source path; the processor has no
      # `source_file`, only `source_path`.
      input_path = opts[:file_path] || source_path
      begin
        self.class.encode(input_path, recipe, output_file)
        # Block form closes the handle once the service has consumed it
        File.open(output_file, 'rb') { |jp2| output_file_service.call(jp2, directives) }
      ensure
        File.unlink(output_file) if File.exist?(output_file)
      end
    end

    # Run kdu_compress on +path+, writing to +output_file+. Both paths are
    # shell-escaped so that spaces or shell metacharacters cannot break the
    # command; the recipe is passed through as-is because it is a list of
    # arguments that relies on shell quoting.
    def self.encode(path, recipe, output_file)
      kdu_compress = Hydra::Derivatives.kdu_compress_path
      execute "#{kdu_compress} -i #{Shellwords.escape(path.to_s)} -o #{Shellwords.escape(output_file.to_s)} #{recipe}"
    end

    # @return [String] location of the sRGB colour profile, resolved
    #   relative to this source file
    def self.srgb_profile_path
      File.join(
        File.expand_path('../../../', __FILE__),
        'color_profiles',
        'sRGB_IEC61966-2-1_no_black_scaling.icc'
      )
    end

    # Reserve a unique temporary file name (the file itself is not created).
    # @param [String] ext extension including the leading dot
    def self.tmp_file(ext)
      Dir::Tmpname.create(['sufia', ext], Hydra::Derivatives.temp_file_base) {}
    end

    # @return the larger of the image's width and height
    def self.long_dim(image)
      [image[:width], image[:height]].max
    end

    # Pick the kdu_compress arguments. A Symbol :recipe is looked up (suffixed
    # with the quality) in the configured recipes, a String :recipe is used
    # verbatim, anything else gets a calculated recipe.
    def self.kdu_compress_recipe(args, quality, long_dim)
      requested = args[:recipe]
      case requested
      when Symbol
        recipe = [requested.to_s, quality].join('_').to_sym
        recipes = Hydra::Derivatives.kdu_compress_recipes
        return recipes[recipe] if recipes.key?(recipe)

        ActiveFedora::Base.logger.warn "No JP2 recipe for :#{requested} ('#{recipe}') found in configuration. Using best guess."
        calculate_recipe(args, quality, long_dim)
      when String
        requested
      else
        calculate_recipe(args, quality, long_dim)
      end
    end

    # Build kdu_compress arguments from :levels, :layers, :compression and
    # :tile_size (defaults: computed from long_dim, 8, 10 and 1024).
    # @return [String] single-line argument string
    def self.calculate_recipe(args, quality, long_dim)
      levels_arg = args.fetch(:levels, level_count_for_size(long_dim))
      rates_arg = layer_rates(args.fetch(:layers, 8), args.fetch(:compression, 10))
      tile_size = args.fetch(:tile_size, 1024)
      tiles_arg = "#{tile_size},#{tile_size}"
      jp2_space_arg = quality == 'gray' ? 'sLUM' : 'sRGB'

      [
        "-rate #{rates_arg}",
        "-jp2_space #{jp2_space_arg}",
        '-double_buffering 10',
        '-num_threads 4',
        '-no_weights',
        "Clevels=#{levels_arg}",
        %("Stiles={#{tiles_arg}}"),
        '"Cblk={64,64}"',
        'Cuse_sop=yes',
        'Cuse_eph=yes',
        'Corder=RPCL',
        'ORGgen_plt=yes',
        'ORGtparts=R'
      ].join(' ').gsub(/\s+/, ' ').strip
    end

    # Count how often long_dim can be halved while it is at least 96,
    # minus one (so values below 96 yield -1).
    def self.level_count_for_size(long_dim)
      levels = 0
      level_size = long_dim
      while level_size >= 96
        level_size /= 2
        levels += 1
      end
      levels - 1
    end

    # Comma separated list of layer rates, e.g. if compression_numerator = 10
    # then compression is 10:1. The first rate is 24.0 / compression_numerator;
    # each following rate is the previous one divided by 1.618, rounded to 8 places.
    def self.layer_rates(layer_count, compression_numerator)
      cmp = 24.0 / compression_numerator
      rates = layer_count.times.map do
        current = cmp
        cmp = (cmp / 1.618).round(8)
        current
      end
      rates.join(',')
    end

    protected

    # Apply the optional resize and sRGB profile conversion in one pass.
    # opts: resize: <geometry>, to_srgb: <bool>, src_quality: 'color'|'gray'
    # The profile is only applied to colour sources.
    def preprocess(image, opts = {})
      image.combine_options do |c|
        c.resize(opts[:resize]) if opts[:resize]
        c.profile self.class.srgb_profile_path if opts[:src_quality] == 'color' && opts[:to_srgb]
      end
      image
    end
  end
end
@@ -0,0 +1,38 @@
1
module Hydra::Derivatives::Processors
  # Abstract base for all processors: a processor takes a single input
  # and produces a single output. Subclasses implement #process.
  class Processor
    attr_accessor :source_path, :directives, :output_file_service

    # @param [String] source_path path to the file on disk
    # @param [Hash] directives directions for creating the output
    # @option directives [String] :format the format of the output
    # @option directives [String] :url the location to put the output
    # @param [Hash] opts
    # @option opts [#call] :output_file_service An output file service to call;
    #   defaults to Hydra::Derivatives.output_file_service
    def initialize(source_path, directives, opts = {})
      default_service = Hydra::Derivatives.output_file_service
      self.source_path = source_path
      self.directives = directives
      self.output_file_service = opts.fetch(:output_file_service, default_service)
    end

    # Subclasses must override this; the base implementation always raises.
    def process
      raise "Processor is an abstract class. Implement `process' on #{self.class.name}"
    end

    # This governs the output key sent to the persist file service.
    # While this is adequate for storing in Fedora, it's not a great name for
    # saving to the file system.
    # NOTE(review): `out_prefix` is not defined in this class - confirm where
    # it is expected to come from.
    def output_file_id(name)
      [out_prefix, name].join('_')
    end

    # The argument is ignored; the result is always the base name of the source path.
    def output_filename_for(_name)
      File.basename(source_path)
    end

    # @deprecated Please use a PersistOutputFileService class to save an object
    def output_file
      raise NotImplementedError, "Processor is an abstract class. Utilize an implementation of a PersistOutputFileService class in #{self.class.name}"
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'mini_magick'
2
+
3
module Hydra::Derivatives::Processors
  # Image processor variant for RAW files: the output format is applied
  # before any scaling, and the transformer's temp files are removed afterwards.
  class RawImage < Image
    class_attribute :timeout

    protected

    # Create the derivative and always clean up the transformer's temp files,
    # also when formatting, resizing or writing raises.
    #
    # @return the value returned by #write_image, as in Image#create_image
    def create_image(destination_name, format, quality = nil)
      xfrm = load_image_transformer
      begin
        # Transpose format and scaling due to the fact that ImageMagick can
        # read but not write RAW files and this will otherwise cause many
        # cryptic segmentation faults
        xfrm.format(format)
        yield(xfrm) if block_given?
        xfrm.quality(quality.to_s) if quality
        write_image(destination_name, format, xfrm)
      ensure
        remove_temp_files(xfrm)
      end
    end

    # Delete any temp files that might clutter up the disk if
    # you are doing a batch or don't touch your temporary storage
    # for a long time
    def remove_temp_files(xfrm)
      xfrm.destroy!
    end

    # Override this method if you want a different transformer, or
    # need to load the raw image from a different source (e.g.
    # external file).
    #
    # NOTE(review): the earlier comment said an extension is always added to
    # help MiniMagick with RAW files, but source_path is opened unchanged
    # here - confirm that source_path already carries the extension.
    def load_image_transformer
      MiniMagick::Image.open(source_path)
    end
  end
end
@@ -0,0 +1,108 @@
1
+ # An abstract class for asynchronous jobs that transcode files using FFMpeg
2
+
3
+ require 'tmpdir'
4
+ require 'open3'
5
+
6
module Hydra::Derivatives::Processors
  # Mixin for processors that create their output by running an external
  # command. The including class is given a `timeout` class attribute and the
  # Open3 methods, and is expected to supply a class-level
  # `encode(path, options, output_file)`, which #encode_file calls.
  module ShellBasedProcessor
    extend ActiveSupport::Concern

    # Number of bytes read from the command's stderr per read
    BLOCK_SIZE = 1024

    included do
      class_attribute :timeout
      extend Open3
    end

    # Transcode the source into the format named by the :format directive.
    #
    # @raise [KeyError] when the directives carry no :label
    # @raise [ArgumentError] when the directives carry no :format
    def process
      directives.fetch(:label) # value unused, but a missing :label must still raise
      format = directives[:format]
      raise ArgumentError, "You must provide the :format you want to transcode into. You provided #{directives}" unless format
      # TODO if the source is in the correct format, we could just copy it and skip transcoding.
      encode_file(format, options_for(format))
    end

    # override this method in subclass if you want to provide specific options.
    # returns a hash of options that the specific processors use
    def options_for(format)
      {}
    end

    # Encode the source into a temporary file and hand that file to the
    # output file service. The file handle is closed and the temporary file
    # removed even when encoding or persisting raises.
    #
    # @return the value returned by the output file service
    def encode_file(file_suffix, options)
      temp_file_name = output_file(file_suffix)
      begin
        self.class.encode(source_path, options, temp_file_name)
        File.open(temp_file_name, 'rb') { |encoded| output_file_service.call(encoded, directives) }
      ensure
        # output_file only reserves a name, so the file may not exist yet
        File.unlink(temp_file_name) if File.exist?(temp_file_name)
      end
    end

    # Reserve a unique temporary file name with the given suffix
    # (the file itself is not created).
    def output_file(file_suffix)
      Dir::Tmpname.create(['sufia', ".#{file_suffix}"], Hydra::Derivatives.temp_file_base) {}
    end

    module ClassMethods
      # Run a shell command, honouring the class-level timeout when one is set.
      # @raise [RuntimeError] when the command exits unsuccessfully
      # @raise [Hydra::Derivatives::TimeoutError] when the timeout is exceeded
      def execute(command)
        context = {}
        if timeout
          execute_with_timeout(timeout, command, context)
        else
          execute_without_timeout(command, context)
        end
      end

      # Run the command and kill it when it takes longer than +timeout+ seconds.
      # +context+ receives the child's pid under :pid.
      def execute_with_timeout(timeout, command, context)
        Timeout.timeout(timeout) do
          execute_without_timeout(command, context)
        end
      rescue Timeout::Error
        pid = context[:pid]
        begin
          # pid is nil when the timeout fired before the child was spawned
          Process.kill("KILL", pid) if pid
        rescue Errno::ESRCH
          # The process already exited; nothing left to kill.
        end
        raise Hydra::Derivatives::TimeoutError, "Unable to execute command \"#{command}\"\nThe command took longer than #{timeout} seconds to execute"
      end

      # Run the command, collecting its stderr for the error message.
      # stdin and stdout of the child are closed immediately.
      # @raise [RuntimeError] when the command exits unsuccessfully
      def execute_without_timeout(command, context)
        err_str = +''
        stdin, stdout, stderr, wait_thr = popen3(command)
        context[:pid] = wait_thr[:pid]
        stdin.close
        stdout.close
        files = [stderr]

        begin
          until all_eof?(files)
            ready = IO.select(files, nil, nil, 60)
            next unless ready

            ready[0].each do |f|
              begin
                err_str << f.read_nonblock(BLOCK_SIZE)
              rescue EOFError
                # No big deal. Log through the logger object; calling
                # `Rails.logger` with an argument raised instead of logging.
                ActiveFedora::Base.logger.debug "Caught an eof error in ShellBasedProcessor"
              end
            end
          end
        ensure
          # Release the pipe on every path, including a timeout interrupt
          stderr.close unless stderr.closed?
        end
        exit_status = wait_thr.value

        raise "Unable to execute command \"#{command}\". Exit code: #{exit_status}\nError message: #{err_str}" unless exit_status.success?
      end

      # @return [Boolean] true when every given IO is at end of file
      def all_eof?(files)
        files.all?(&:eof)
      end
    end
  end
end
@@ -1,4 +1,4 @@
1
- module Hydra::Derivatives
1
+ module Hydra::Derivatives::Processors
2
2
  module Video
3
3
  extend ActiveSupport::Autoload
4
4
 
@@ -1,4 +1,4 @@
1
- module Hydra::Derivatives::Video
1
+ module Hydra::Derivatives::Processors::Video
2
2
  class Config
3
3
  attr_writer :video_bitrate, :video_attributes, :size_attributes, :audio_attributes
4
4