derivative-rodeo 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +15 -0
- data/README.md +251 -0
- data/Rakefile +42 -0
- data/derivative_rodeo.gemspec +54 -0
- data/lib/derivative/rodeo.rb +3 -0
- data/lib/derivative-rodeo.rb +3 -0
- data/lib/derivative_rodeo/configuration.rb +95 -0
- data/lib/derivative_rodeo/errors.rb +56 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
- data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
- data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
- data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
- data/lib/derivative_rodeo/services/base_service.rb +15 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
- data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
- data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
- data/lib/derivative_rodeo/services/image_service.rb +73 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
- data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
- data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
- data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
- data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
- data/lib/derivative_rodeo/services/url_service.rb +42 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
- data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
- data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
- data/lib/derivative_rodeo/technical_metadata.rb +23 -0
- data/lib/derivative_rodeo/version.rb +5 -0
- data/lib/derivative_rodeo.rb +36 -0
- metadata +339 -0
@@ -0,0 +1,130 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'open3'
|
3
|
+
require 'mini_magick'
|
4
|
+
|
5
|
+
module DerivativeRodeo
  module Services
    module PdfSplitter
      # A simple data structure that summarizes the image properties of the given path.
      PagesSummary = Struct.new(
        :path, :page_count, :width,
        :height, :pixels_per_inch, :color_description,
        :channels, :bits_per_channel, keyword_init: true
      ) do
        # Zero-based column positions within one whitespace-split row of `pdfimages -list` output.
        #
        # NOTE: a constant assigned inside a block is defined in the enclosing lexical scope, so
        # these resolve as PdfSplitter::COL_WIDTH (and friends), which is how .extract_from below
        # reaches them.
        COL_WIDTH = 3
        COL_HEIGHT = 4
        COL_COLOR_DESC = 5
        COL_CHANNELS = 6
        COL_BITS = 7
        # only poppler 0.25+ has this column in output:
        COL_XPPI = 12

        # @return [Array<String, Integer, Integer>] the color description, channel count and bits
        #   per channel, in that order.
        def color
          [color_description, channels, bits_per_channel]
        end
        alias_method :ppi, :pixels_per_inch
        alias_method :bits, :bits_per_channel

        # If the underlying extraction couldn't set the various properties, we likely have an
        # invalid PDF.
        #
        # The checks read this struct's own members; the struct has no `pdf_pages_summary` member
        # to delegate to.
        #
        # @return [Boolean] false when any of color_description, channels, bits_per_channel or
        #   height is nil, or when page_count is zero/nil; true otherwise.
        def valid?
          return false if color_description.nil?
          return false if channels.nil?
          return false if bits_per_channel.nil?
          return false if height.nil?
          return false if page_count.to_i.zero?

          true
        end
      end

      ##
      # @api public
      #
      # @param path [String]
      # @return [DerivativeRodeo::PdfSplitter::PagesSummary]
      #
      # Responsible for determining the image properties of the PDF.
      #
      # @note
      #
      #   Uses poppler 0.19+ pdfimages command to extract image listing metadata from PDF files.
      #   Though we are optimizing for 0.25 or later for poppler.
      #
      # @note
      #
      #   For dpi extraction, falls back to calculating using MiniMagick, if necessary.
      #
      # The first two lines are tabular header information:
      #
      # @example Output from PDF Images
      #
      #   bash-5.1$ pdfimages -list fmc_color.pdf | head -5
      #   page num step width height color comp bpc enc interp object ID x-ppi y-ppi size ratio
      #   --------------------------------------------------------------------------------------------
      #   1 0 image 2475 413 rgb 3 8 jpeg no 10 0 300 300 21.8K 0.7%
      # rubocop:disable Metrics/AbcSize - Because this helps us process the results in one loop.
      # rubocop:disable Metrics/MethodLength - Again, to help speed up the processing loop.
      # rubocop:disable Metrics/CyclomaticComplexity
      # rubocop:disable Metrics/PerceivedComplexity
      def PagesSummary.extract_from(path:)
        # The command is handed to the OS as an argument vector instead of a shell string, so a
        # path containing spaces or shell metacharacters is treated as one literal file name.
        # capture3 drains stderr for us, which replaces the previous `2>/dev/null` redirect
        # (see https://github.com/scientist-softserv/iiif_print/pull/223/files for why the
        # warnings are discarded).
        listing =
          begin
            Open3.capture3('pdfimages', '-list', path.to_s).first
          rescue Errno::ENOENT
            # Without a shell in between, a missing pdfimages binary surfaces as ENOENT. The shell
            # based invocation produced empty output in that situation, so keep doing that; the
            # resulting summary has a page_count of zero.
            ''
          end

        page_count = 0
        color_description = 'gray'
        width = 0
        height = 0
        channels = 0
        bits_per_channel = 0
        pixels_per_inch = 0

        listing.split("\n").each_with_index do |line, index|
          # Skip the two header lines (see the above example)
          next if index <= 1

          page_count += 1
          cells = line.split

          # A single non-gray row makes the whole document 'rgb'; the numeric properties keep
          # the largest value seen on any row.
          color_description = 'rgb' if cells[COL_COLOR_DESC] != 'gray'
          width = [width, cells[COL_WIDTH].to_i].max
          height = [height, cells[COL_HEIGHT].to_i].max
          channels = [channels, cells[COL_CHANNELS].to_i].max
          bits_per_channel = [bits_per_channel, cells[COL_BITS].to_i].max

          # In the case of poppler version < 0.25, we will have no more than 12 columns. As such,
          # we need to do some alternative magic to calculate this.
          if page_count == 1 && cells.size <= 12
            pdf = MiniMagick::Image.open(path)
            width_points = pdf.width
            width_px = width
            pixels_per_inch = (72 * width_px / width_points).to_i
          elsif cells[COL_XPPI].to_i > pixels_per_inch
            pixels_per_inch = cells[COL_XPPI].to_i
          end
          # By the magic of nil#to_i if we don't have more than 12 columns, we've already set
          # the pixels_per_inch and the elsif branch won't do much of anything.
        end

        new(
          path: path,
          page_count: page_count,
          pixels_per_inch: pixels_per_inch,
          width: width,
          height: height,
          color_description: color_description,
          channels: channels,
          bits_per_channel: bits_per_channel
        )
      end
      # rubocop:enable Metrics/AbcSize
      # rubocop:enable Metrics/MethodLength
      # rubocop:enable Metrics/CyclomaticComplexity
      # rubocop:enable Metrics/PerceivedComplexity
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
  module Services
    module PdfSplitter
      # Splits a PDF into one png file per page.
      class PngPage < PdfSplitter::Base
        self.image_extension = 'png'

        # Picks the output device name from the PDF's summarized color information and memoizes
        # it (presumably a Ghostscript device name, given the method name -- confirm against
        # PdfSplitter::Base).
        #
        # @return [String] 'pngmonod' for 1-bit gray, 'pnggray' for deeper gray, otherwise 'png16m'.
        def gsdevice
          return @gsdevice if defined?(@gsdevice)

          tone = pdf_pages_summary.color_description
          depth = pdf_pages_summary.bits_per_channel
          grayscale = tone == 'gray'

          @gsdevice =
            if grayscale && depth == 1
              # 1 Bit Grayscale, if applicable:
              'pngmonod'
            elsif grayscale && depth > 1
              'pnggray'
            else
              'png16m'
            end
        end
      end
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
  module Services
    module PdfSplitter
      ##
      # Splits a PDF into one tiff file per page.
      class TiffPage < PdfSplitter::Base
        self.image_extension = 'tiff'
        self.compression = 'lzw'

        ##
        # @api private
        #
        # Memoized device name derived from the PDF's summarized color information: a gray
        # device when one applies, otherwise a color device.
        #
        # @return [String]
        def gsdevice
          return @gsdevice if defined?(@gsdevice)

          tone = pdf_pages_summary.color_description
          channel_count = pdf_pages_summary.channels
          depth = pdf_pages_summary.bits_per_channel

          @gsdevice = color_bpc(tone, depth)

          # Not gray (or gray with an unusable bit depth), so fall back to a color device.
          @gsdevice ||= colordevice(channel_count, depth)
        end

        # @param color [String] the summarized color description
        # @param bpc [Integer] bits per channel
        #
        # @return [String] 'tiffg4' or 'tiffgray' for gray sources
        # @return [NilClass] when the source is not gray, or bpc is below 1
        def color_bpc(color, bpc)
          return nil if color != 'gray'

          if bpc == 1
            # CCITT Group 4 Black and White: this also switches the compression to match.
            self.compression = 'g4'
            return 'tiffg4'
          end

          # 8 Bit Grayscale, if applicable:
          'tiffgray' if bpc > 1
        end

        # @param channels [Integer]
        # @param bpc [Integer] bits per channel
        #
        # @return [String] either 'tiff24nc' or 'tiff48nc'; any other total bit depth (for
        #   example a CMYK source) is rendered as 24 bit, i.e. 8bpc RGB.
        def colordevice(channels, bpc)
          total_bits = bpc * channels
          supported = [24, 48].include?(total_bits)
          "tiff#{supported ? total_bits : 24}nc"
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
  module Services
    module PdfSplitterService
      ##
      # @api public
      #
      # Look up the splitter class whose name is derived from the given name by appending
      # "_page" and classifying it, within the DerivativeRodeo::Services::PdfSplitter namespace.
      #
      # @param name [#to_s]
      # @return [Class] the constant resolved under DerivativeRodeo::Services::PdfSplitter
      def self.for(name)
        page_class = "#{name.to_s.classify}_page".classify
        qualified_name = "DerivativeRodeo::Services::PdfSplitter::#{page_class}"
        qualified_name.constantize
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'httparty'
|
4
|
+
|
5
|
+
module DerivativeRodeo
  module Services
    ##
    # A utility class for handling general URLs.  Provided as a means of easing the implementation
    # logic of those that use this class.
    #
    # @note
    #   It is a good design idea to wrap a library (in this case HTTParty).  The goal is to expose
    #   the smallest interface and make it something that would be easy to swap out.
    #
    # @see https://rubygems.org/gems/httparty
    module UrlService
      ##
      # Fetch the body found at the given URL.
      #
      # @param url [String]
      #
      # @return [String] the response body
      # @raise [StandardError] whatever the GET raised, re-raised after it has been logged
      def self.read(url)
        HTTParty.get(url, logger: DerivativeRodeo.config.logger).body
      rescue StandardError => e
        # The logger must be reached through DerivativeRodeo.config; this module has no `config`
        # method of its own, so a bare `config` here would raise NameError and hide the real error.
        DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
        raise
      end

      ##
      # Issue a HEAD request for the given URL.
      #
      # @param url [String]
      #
      # @return [Object] whatever HTTParty.head returned when the request did not raise
      # @return [FalseClass] when the head request raised an error (which is logged)
      #
      # NOTE(review): a response with a non-success status is returned as-is (and is therefore
      # truthy) unless HTTParty raises for it -- confirm that matches what callers expect.
      def self.exists?(url)
        HTTParty.head(url, logger: DerivativeRodeo.config.logger)
      rescue StandardError => e
        DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
        false
      end
    end
  end
end
|
@@ -0,0 +1,251 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'tmpdir'
|
4
|
+
|
5
|
+
module DerivativeRodeo
  module StorageLocations
    ##
    # When the output location is the same type of location as "this" location, we indicate that via
    # the SAME constant.
    SAME = :same

    ##
    # The base location for storing files.
    #
    # - dir :: is the directory path
    # - path :: is the full file path
    # - uri :: is the full file path plus the uri prefix parts
    #
    # A location represents a pointer to a storage location.  The {#exist?} method can answer if a
    # file exists at the path.
    #
    # rubocop:disable Metrics/ClassLength
    class BaseLocation
      @locations = []

      ##
      # The registry of known location names.  There is exactly one registry, owned by
      # BaseLocation; subclasses (at any depth) read and write that same array so that
      # {.inherited}, {.register_location} and {.load_location} always agree.
      #
      # @return [Array<String>]
      def self.locations
        return BaseLocation.locations unless equal?(BaseLocation)

        @locations ||= []
      end

      # Records the location name of every subclass as soon as the subclass is defined.
      def self.inherited(subclass)
        locations << subclass.location_name
        super
      end

      ##
      # The class name without its namespace, underscored, minus a trailing "_location".
      #
      # @return [String]
      def self.location_name
        to_s.demodulize.underscore.sub(/_location$/, '')
      end

      class << self
        alias scheme location_name
      end

      ##
      # @param location_name [String] a location name, or a URI/template whose scheme (the part
      #        before "://") is the location name.
      #
      # @return [Class]
      # @raise [Errors::StorageLocationNotFoundError] when the name is not in {.locations}
      def self.load_location(location_name)
        location_name = location_name.split("://").first
        raise Errors::StorageLocationNotFoundError.new(location_name: location_name) unless locations.include?(location_name)

        # camelize is the inverse of the underscore used by .location_name.  classify would also
        # singularize the name first, which is not part of that round trip.
        "DerivativeRodeo::StorageLocations::#{location_name.to_s.camelize}Location".constantize
      end

      ##
      # @param file_uri [String] of the form scheme://arbitrary-stuff
      #
      # @return [BaseLocation]
      # @raise [Errors::StorageLocationMissing] when no location name can be read from the URI
      def self.from_uri(file_uri)
        location_name = file_uri.split('://').first
        raise Errors::StorageLocationMissing.new(file_uri: file_uri) if location_name.blank?

        load_location(location_name).new(file_uri)
      end

      ##
      # Registers the location with the main StorageLocation class so it can be used.  Registering
      # a name that is already present is a no-op.
      #
      # @param location_name [String]
      def self.register_location(location_name)
        return if DerivativeRodeo::StorageLocations::BaseLocation.locations.include?(location_name.to_s)

        DerivativeRodeo::StorageLocations::BaseLocation.locations << location_name.to_s
      end

      ##
      # Create a new uri of the classes type. Parts argument should have a default in
      # implementing classes. Must support a number or the symbol :all
      #
      # @api public
      #
      # @param path [String]
      # @param parts [Integer, :all]
      # @return [String]
      # @raise [NotImplementedError] always, in this base implementation
      #
      # @see .file_path_from_parts
      def self.create_uri(path:, parts:)
        # Inside a class method `self` is the class itself; `self.class` would only print "Class".
        raise NotImplementedError, "#{self}.create_uri"
      end

      ##
      # Build a {StorageLocations::BaseLocation} by converting the :from_uri with the :template via
      # the given :service.
      #
      # @param from_uri [String]
      # @param template [String]
      # @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
      #
      # @return [StorageLocations::BaseLocation]
      def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
        # HACK: Ensuring that we have the correct scheme.  Maybe this is a hack?
        from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
        to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
        new(to_uri)
      end

      ##
      # Keep only the trailing "/"-separated segments of the path.
      #
      # @param path [String]
      # @param parts [Integer, :all] how many trailing segments to keep; the sign of the number is
      #        ignored.  With :all, or with more parts than the path has segments, every segment
      #        is kept.
      #
      # @return [String]
      def self.file_path_from_parts(path:, parts:)
        return path if parts == :all

        segments = path.split('/')
        # Slicing past the front of the array yields nil; fall back to every segment.
        trailing = segments[-parts.abs..-1] || segments
        trailing.join('/')
      end

      ##
      # @param file_uri [String] a URI to the file's location; this is **not** a templated URI (as
      #        described in {DerivativeRodeo::Services::ConvertUriViaTemplateService}
      # @param config [DerivativeRodeo::Configuration]
      def initialize(file_uri, config: DerivativeRodeo.config)
        @file_uri = file_uri
        @config = config
      end

      attr_accessor :tmp_file_path
      private :tmp_file_path=, :tmp_file_path

      attr_reader :config, :file_uri

      ##
      # Yields a temporary path holding a fresh, empty file.
      #
      # @param auto_write_file [Boolean] Provided as a testing helper method.
      #
      # @yieldparam tmp_file_path [String]
      #
      # @return [StorageLocations::BaseLocation]
      # @see with_tmp_path
      def with_new_tmp_path(auto_write_file: true, &block)
        with_tmp_path(lambda { |_file_path, tmp_file_path, exist|
          FileUtils.rm_rf(tmp_file_path) if exist
          FileUtils.touch(tmp_file_path)
        }, auto_write_file: auto_write_file, &block)
      end

      ##
      # @yieldparam tmp_file_path [String]
      # @return [StorageLocations::BaseLocation]
      # @raise [NotImplementedError] always, in this base implementation
      def with_existing_tmp_path
        raise NotImplementedError, "#{self.class}#with_existing_tmp_path"
      end

      ##
      # @param preamble_lambda [Lambda, #call] the "function" we should call to prepare the
      #        temporary location before we yield its location.  It is called with the file
      #        path, the temporary file path and the result of {#exist?}.
      #
      # @param auto_write_file [Boolean] Provided as a testing helper method.  Given that we have
      #        both {#with_new_tmp_path} and {#with_existing_tmp_path}, we want the default to not
      #        automatically perform the write.  But this is something we can easily forget when
      #        working with the {#with_new_tmp_path}
      #
      # @yieldparam tmp_file_path [String]
      #
      # @return [StorageLocations::BaseLocation]
      # @raise [ArgumentError] when no block is given
      def with_tmp_path(preamble_lambda, auto_write_file: false)
        raise ArgumentError, 'Expected a block' unless block_given?

        begin
          tmp_file_dir do |tmpdir|
            self.tmp_file_path = File.join(tmpdir, file_dir, file_name)
            FileUtils.mkdir_p(File.dirname(tmp_file_path))
            preamble_lambda.call(file_path, tmp_file_path, exist?)
            yield tmp_file_path
            write if auto_write_file
          end
        ensure
          # The temporary directory only lives for the duration of the block above, so never
          # leave a path into it behind -- not even when the preamble, the block or the write
          # raised.
          self.tmp_file_path = nil
        end

        # In returning self we again remove the need for those calling #with_new_tmp_path,
        # #with_tmp_path, and #with_existing_tmp_path to remember to return the current Location.
        # In other words removing the jagged edges of the code.
        self
      end

      ##
      # Write the tmp file to the file_uri
      #
      # @raise [NotImplementedError] always, in this base implementation
      def write
        raise NotImplementedError, "#{self.class}#write"
      end

      ##
      # @return [TrueClass] when the file exists in this storage
      # @return [FalseClass] when the file does not exist in this storage
      # @raise [NotImplementedError] always, in this base implementation
      def exist?
        raise NotImplementedError, "#{self.class}#exist?"
      end
      alias exists? exist?

      ##
      # @param template [String]
      # @return [StorageLocations::BaseLocation]
      #
      # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
      def derived_file_from(template:)
        klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
        klass.build(from_uri: file_path, template: template)
      end

      ##
      # @param extension [String, StorageLocations::SAME]
      # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use
      #         the file's extension.
      #
      # Everything after the first "." of the file *name* is replaced.  Directory segments are left
      # alone, so a dotted directory (or host name) earlier in the path does not truncate the path.
      def with_new_extension(extension)
        return file_path if extension == StorageLocations::SAME

        directory, separator, name = file_path.rpartition('/')
        "#{directory}#{separator}#{name.split('.')[0]}.#{extension}"
      end

      # @return [String] the file_uri with everything up to and including "://" removed
      def file_path
        @file_path ||= @file_uri.sub(%r{.+://}, '')
      end

      # @return [String] the directory portion of {#file_path}
      def file_dir
        @file_dir ||= File.dirname(file_path)
      end

      # @return [String] the last segment of {#file_path}, extension included
      def file_name
        @file_name ||= File.basename(file_path)
      end

      # @return [String] the extension of {#file_path}, leading "." included
      def file_extension
        @file_extension ||= File.extname(file_path)
      end

      # @return [String] the last segment of {#file_path} without its extension
      def file_basename
        @file_basename ||= File.basename(file_path, file_extension)
      end

      # Yields a newly made temporary directory (see Dir.mktmpdir).
      #
      # @yieldparam tmpdir [String]
      # @raise [ArgumentError] when no block is given
      def tmp_file_dir(&block)
        raise ArgumentError, 'Expected a block' unless block_given?

        Dir.mktmpdir(&block)
      end
    end
    # rubocop:enable Metrics/ClassLength
  end
end
|
248
|
+
|
249
|
+
# Load every file found beneath this directory (in sorted order), skipping directories and any
# path that mentions 'base_location'.
location_files = Dir.glob(File.join(__dir__, '**/*')).sort.reject do |entry|
  File.directory?(entry) || entry.match?('base_location')
end
location_files.each { |entry| require entry }
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'httparty'
|
4
|
+
|
5
|
+
module DerivativeRodeo
  module StorageLocations
    ##
    # A helper module for download-only locations: it fetches the location's file over HTTP(S)
    # into a temporary path and deliberately refuses to write.
    module DownloadConcern
      extend ActiveSupport::Concern

      class_methods do
        # @param path [String]
        # @param parts [Integer, :all] see file_path_from_parts
        # @param ssl [Boolean] selects the "https://" (true) or "http://" (false) prefix
        #
        # @return [String]
        def create_uri(path:, parts: :all, ssl: true)
          file_path = file_path_from_parts(path: path, parts: parts)
          "#{adapter_prefix(ssl: ssl)}#{file_path}"
        end

        # @param ssl [Boolean]
        # @return [String] "https://" when ssl is truthy, otherwise "http://"
        def adapter_prefix(ssl: true)
          ssl ? "https://" : "http://"
        end
      end

      delegate :config, to: DerivativeRodeo

      ##
      # Download the file at file_uri into the temporary path, then yield that path.
      #
      # @yieldparam tmp_file_path [String]
      # @raise [Errors::FileMissingError] when the remote file does not exist
      def with_existing_tmp_path(&block)
        with_tmp_path(lambda { |_file_path, tmp_file_path, exist|
          raise Errors::FileMissingError unless exist

          # This module's fetch method is #read; it does not define a #get.
          response = read(file_uri)
          File.open(tmp_file_path, 'wb') { |fp| fp.write(response.body) }
        }, &block)
      end

      ##
      # Answers the existence question for this location's own file_uri.  with_tmp_path asks
      # the location via #exist?, so it has to be answered here rather than left unimplemented.
      #
      # @return [Boolean]
      # @see #exists?
      def exist?
        exists?(file_uri) ? true : false
      end

      ##
      # Implemented to complete the interface.
      #
      # @raise [RuntimeError] always; download locations are deliberately read-only
      def write
        raise "#{self.class}#write is deliberately not implemented"
      end

      ##
      # @param url [String]
      #
      # @return [Object] whatever HTTParty.get returned (callers use its #body)
      # @raise [StandardError] whatever the GET raised, re-raised after it has been logged
      def read(url)
        HTTParty.get(url, logger: config.logger)
      rescue => e
        config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
        raise e
      end

      ##
      # @param url [String] defaults to this location's file_uri, so the method can also be called
      #        without arguments.
      #
      # @return [Object] whatever HTTParty.head returned when the request did not raise
      # @return [FalseClass] when the head request raised an error (which is logged)
      #
      # NOTE(review): a response with a non-success status is returned as-is (and is therefore
      # truthy) unless HTTParty raises for it -- confirm that matches what callers expect.
      def exists?(url = file_uri)
        HTTParty.head(url, logger: config.logger)
      rescue => e
        config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
        false
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
  module StorageLocations
    ##
    # Location for files found on a local disk
    class FileLocation < BaseLocation
      # @param path [String]
      # @param parts [Integer, :all] see file_path_from_parts
      #
      # @return [String] the selected part of the path, prefixed with this location's scheme
      def self.create_uri(path:, parts: :all)
        file_path = file_path_from_parts(path: path, parts: parts)
        "#{adapter_prefix}#{file_path}"
      end

      # @return [String] the scheme followed by "://"
      def self.adapter_prefix
        "#{scheme}://"
      end

      ##
      # Copy the existing file into the temporary path, then yield that path.
      #
      # @yieldparam tmp_file_path [String]
      # @raise [Errors::FileMissingError] when the file does not exist on disk
      def with_existing_tmp_path(&block)
        with_tmp_path(lambda { |file_path, tmp_file_path, exist|
          raise Errors::FileMissingError unless exist

          FileUtils.cp(file_path, tmp_file_path)
        }, &block)
      end

      # @return [Boolean] whether a file exists at file_path
      def exist?
        File.exist?(file_path)
      end

      ##
      # Write the temporary file to the file_uri, creating the target directory when needed.
      #
      # @return [String] the file_uri
      # @raise [Errors::FileMissingError] when there is no temporary file to copy, which is the
      #        case outside of a with_new_tmp_path block.
      def write
        # tmp_file_path is nil outside of a with_*_tmp_path block; File.exist?(nil) would raise a
        # TypeError instead of the intended error.
        unless tmp_file_path && File.exist?(tmp_file_path)
          # NOTE(review): assumes Errors::FileMissingError accepts a message like a plain
          # StandardError subclass -- confirm against errors.rb.
          raise Errors::FileMissingError, "Use write within a with_new_tmp_path block and fill the tmp file with data before writing"
        end

        FileUtils.mkdir_p(file_dir)
        FileUtils.cp_r(tmp_file_path, file_path)
        file_uri
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'derivative_rodeo/storage_locations/concerns/download_concern'
|
4
|
+
|
5
|
+
module DerivativeRodeo
  module StorageLocations
    ##
    # A web location whose class name maps to the "http" location name.  All of its behavior
    # comes from {DownloadConcern}: files can be downloaded, never written.
    class HttpLocation < BaseLocation
      include DownloadConcern
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'derivative_rodeo/storage_locations/concerns/download_concern'
|
4
|
+
|
5
|
+
module DerivativeRodeo
  module StorageLocations
    ##
    # A web location whose class name maps to the "https" location name.  All of its behavior
    # comes from {DownloadConcern}: files can be downloaded, never written.
    class HttpsLocation < BaseLocation
      include DownloadConcern
    end
  end
end
|