assembly-objectfile 1.11.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +14 -0
  3. data/.github/pull_request_template.md +3 -5
  4. data/.gitignore +0 -1
  5. data/.rubocop.yml +87 -15
  6. data/.rubocop_todo.yml +19 -74
  7. data/Gemfile +2 -0
  8. data/Gemfile.lock +106 -0
  9. data/README.md +1 -1
  10. data/assembly-objectfile.gemspec +5 -6
  11. data/lib/assembly-objectfile/object_file.rb +253 -3
  12. data/lib/assembly-objectfile/version.rb +2 -2
  13. data/lib/assembly-objectfile.rb +0 -5
  14. data/spec/object_file_spec.rb +411 -167
  15. data/spec/spec_helper.rb +3 -31
  16. data/spec/test_data/empty.txt +0 -0
  17. metadata +35 -121
  18. data/.travis.yml +0 -20
  19. data/lib/assembly-objectfile/content_metadata/config.rb +0 -26
  20. data/lib/assembly-objectfile/content_metadata/file.rb +0 -63
  21. data/lib/assembly-objectfile/content_metadata/file_set.rb +0 -73
  22. data/lib/assembly-objectfile/content_metadata/file_set_builder.rb +0 -65
  23. data/lib/assembly-objectfile/content_metadata/nokogiri_builder.rb +0 -57
  24. data/lib/assembly-objectfile/content_metadata.rb +0 -117
  25. data/lib/assembly-objectfile/object_fileable.rb +0 -278
  26. data/spec/content_metadata_spec.rb +0 -791
  27. data/spec/test_data/input/oo000oo0001/00/oo000oo0001_00_001.tif +0 -0
  28. data/spec/test_data/input/oo000oo0001/00/oo000oo0001_00_002.tif +0 -0
  29. data/spec/test_data/input/oo000oo0001/05/oo000oo0001_05_001.jp2 +0 -0
  30. data/spec/test_data/input/oo000oo0001/05/oo000oo0001_05_002.jp2 +0 -0
  31. data/spec/test_data/input/oo000oo0001/15/oo000oo0001_15_001.pdf +0 -1
  32. data/spec/test_data/input/oo000oo0001/15/oo000oo0001_15_002.pdf +0 -1
  33. data/spec/test_data/input/oo000oo0001/31/oo000oo0001_31_001.pdf +0 -1
  34. data/spec/test_data/input/oo000oo0001/50/oo000oo0001_50_001.tif +0 -0
  35. data/spec/test_data/input/oo000oo0001/oo000oo0001_book.pdf +0 -1
  36. data/spec/test_data/input/res1_image1.jp2 +0 -0
  37. data/spec/test_data/input/res1_image2.jp2 +0 -0
  38. data/spec/test_data/input/res1_image2.tif +0 -0
  39. data/spec/test_data/input/res1_teifile.txt +0 -1
  40. data/spec/test_data/input/res2_image1.jp2 +0 -0
  41. data/spec/test_data/input/res2_image1.tif +0 -0
  42. data/spec/test_data/input/res2_image2.jp2 +0 -0
  43. data/spec/test_data/input/res2_image2.tif +0 -0
  44. data/spec/test_data/input/res2_teifile.txt +0 -1
  45. data/spec/test_data/input/res2_textfile.txt +0 -1
  46. data/spec/test_data/input/res3_image1.jp2 +0 -0
  47. data/spec/test_data/input/res3_image1.tif +0 -0
  48. data/spec/test_data/input/res3_teifile.txt +0 -1
  49. data/spec/test_data/input/test.pdf +0 -1
  50. data/spec/test_data/input/test2.jp2 +0 -0
  51. data/spec/test_data/input/test2.tif +0 -0
@@ -1,117 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
- require 'deprecation'
5
- require 'active_support'
6
- require 'assembly-objectfile/content_metadata/file'
7
- require 'assembly-objectfile/content_metadata/file_set'
8
- require 'assembly-objectfile/content_metadata/file_set_builder'
9
- require 'assembly-objectfile/content_metadata/config'
10
- require 'assembly-objectfile/content_metadata/nokogiri_builder'
11
-
12
- module Assembly
13
- SPECIAL_DPG_FOLDERS = %w[31 44 50].freeze # these special dpg folders will force any files contained in them into their own resources, regardless of filenaming convention
14
- # these are used when :bundle=>:dpg only
15
-
16
- DEPRECATED_STYLES = %i[book_with_pdf book_as_image].freeze
17
- VALID_STYLES = %i[simple_image simple_book file map document 3d webarchive-seed].freeze
18
-
19
- # This class generates content metadata for image files
20
- class ContentMetadata
21
- # Generates image content XML metadata for a repository object.
22
- # This method only produces content metadata for images
23
- # and does not depend on a specific folder structure. Note that it is class level method.
24
- #
25
- # @param [Hash] params a hash containing parameters needed to produce content metadata
26
- # :druid = required - a string of druid of the repository object's druid id (with or without 'druid:' prefix)
27
- # :objects = required - an array of Assembly::ObjectFile objects containing the list of files to add to content metadata
28
- # NOTE: if you set the :bundle option to :prebundled, you will need to pass in an array of arrays, and not a flat array, as noted below
29
- # :style = optional - a symbol containing the style of metadata to create, allowed values are
30
- # :simple_image (default), contentMetadata type="image", resource type="image"
31
- # :file, contentMetadata type="file", resource type="file"
32
- # :simple_book, contentMetadata type="book", resource type="page", but any resource which has file(s) other than an image, and also contains no images at all, will be resource type="object"
33
- # :book_with_pdf, contentMetadata type="book", resource type="page", but any resource which has any file(s) other than an image will be resource type="object" - NOTE: THIS IS DEPRECATED
34
- # :book_as_image, as simple_book, but with contentMetadata type="book", resource type="image" (same rule applies for resources with non images) - NOTE: THIS IS DEPRECATED
35
- # :map, like simple_image, but with contentMetadata type="map", resource type="image"
36
- # :3d, contentMetadata type="3d", ".obj" and other configured 3d extension files go into resource_type="3d", everything else into resource_type="file"
37
- # :webarchive-seed, contentMetadata type="webarchive-seed", resource type="image"
38
- # :bundle = optional - a symbol containing the method of bundling files into resources, allowed values are
39
- # :default = all files get their own resources (default)
40
- # :filename = files with the same filename but different extensions get bundled together in a single resource
41
- # :dpg = files representing the same image but of different mimetype that use the SULAIR DPG filenaming standard (00 vs 05) get bundled together in a single resource
42
- # :prebundled = this option requires you to prebundle the files passed in as an array of arrays, indicating how files are bundled into resources; this is the most flexible option since it gives you full control
43
- # :add_exif = optional - a boolean to indicate if exif data should be added (mimetype, filesize, image height/width, etc.) to each file, defaults to false and is not required if project goes through assembly
44
- # :add_file_attributes = optional - a boolean to indicate if publish/preserve/shelve/role attributes should be added using defaults or by supplied override by mime/type, defaults to false and is not required if project goes through assembly
45
- # :file_attributes = optional - a hash of file attributes by mimetype to use instead of defaults, only used if add_file_attributes is also true,
46
- # If a mimetype match is not found in your hash, the default is used (either your supplied default or the gem's).
47
- # e.g. {'default'=>{:preserve=>'yes',:shelve=>'yes',:publish=>'yes'},'image/tif'=>{:preserve=>'yes',:shelve=>'no',:publish=>'no'},'application/pdf'=>{:preserve=>'yes',:shelve=>'yes',:publish=>'yes'}}
48
- # :include_root_xml = optional - a boolean to indicate if the contentMetadata returned includes a root <?xml version="1.0"?> tag, defaults to true
49
- # :preserve_common_paths = optional - When creating the file "id" attribute, content metadata uses the "relative_path" attribute of the ObjectFile objects passed in. If the "relative_path" attribute is not set, the "path" attribute is used instead,
50
- # which includes a full path to the file. If the "preserve_common_paths" parameter is set to false or left off, then the common paths of all of the ObjectFile's passed in are removed from any "path" attributes. This should turn full paths into
51
- # the relative paths that are required in content metadata file id nodes. If you do not want this behavior, set "preserve_common_paths" to true. The default is false.
52
- # :flatten_folder_structure = optional - Will remove *all* folder structure when generating file IDs (e.g. DPG subfolders like '00','05' will be removed). This is useful if the folder structure is flattened when staging files (like for DPG).
53
- # The default is false. If set to true, will override the "preserve_common_paths" parameter.
54
- # :auto_labels = optional - Will add automated resource labels (e.g. "File 1") when labels are not provided by the user. The default is true.
55
- # See https://consul.stanford.edu/pages/viewpage.action?spaceKey=chimera&title=DOR+content+types%2C+resource+types+and+interpretive+metadata for next two settings
56
- # :reading_order = optional - only valid for simple_book, can be 'rtl' or 'ltr'. The default is 'ltr'.
57
- # Example:
58
- # Assembly::ContentMetadata.create_content_metadata(:druid=>'druid:nx288wh8889',:style=>:simple_image,:objects=>object_files,:add_file_attributes=>false)
59
- def self.create_content_metadata(druid:, objects:, auto_labels: true,
60
- add_exif: false, bundle: :default, style: :simple_image,
61
- add_file_attributes: false, file_attributes: {},
62
- preserve_common_paths: false, flatten_folder_structure: false,
63
- include_root_xml: nil, reading_order: 'ltr')
64
-
65
- common_path = find_common_path(objects) unless preserve_common_paths # find common paths to all files provided if needed
66
-
67
- filesets = FileSetBuilder.build(bundle: bundle, objects: objects, style: style)
68
- config = Config.new(auto_labels: auto_labels,
69
- flatten_folder_structure: flatten_folder_structure,
70
- add_file_attributes: add_file_attributes,
71
- file_attributes: file_attributes,
72
- add_exif: add_exif,
73
- reading_order: reading_order,
74
- type: object_level_type(style))
75
-
76
- builder = NokogiriBuilder.build(druid: druid,
77
- filesets: filesets,
78
- common_path: common_path,
79
- config: config)
80
-
81
- if include_root_xml == false
82
- builder.doc.root.to_xml
83
- else
84
- builder.to_xml
85
- end
86
- end
87
-
88
- def self.special_dpg_folder?(folder)
89
- SPECIAL_DPG_FOLDERS.include?(folder)
90
- end
91
-
92
- def self.find_common_path(objects)
93
- all_paths = objects.flatten.map do |obj|
94
- raise "File '#{obj.path}' not found" unless obj.file_exists?
95
-
96
- obj.path # collect all of the filenames into an array
97
- end
98
-
99
- Assembly::ObjectFile.common_path(all_paths) # find common paths to all files provided if needed
100
- end
101
- private_class_method :find_common_path
102
-
103
- def self.object_level_type(style)
104
- Deprecation.warn(self, "the style #{style} is now deprecated and should not be used. This will be removed in assembly-objectfile 2.0") if DEPRECATED_STYLES.include? style
105
- raise "Supplied style (#{style}) not valid" unless (VALID_STYLES + DEPRECATED_STYLES).include? style
106
-
107
- case style
108
- when :simple_image
109
- 'image'
110
- when :simple_book, :book_with_pdf, :book_as_image
111
- 'book'
112
- else
113
- style.to_s
114
- end
115
- end
116
- end # class
117
- end # module
@@ -1,278 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'mini_exiftool'
4
- require 'mime/types'
5
-
6
- module Assembly
7
- # Common behaviors we need for other classes in the gem
8
- module ObjectFileable
9
- attr_accessor :file_attributes, :label, :path, :provider_md5, :provider_sha1, :relative_path, :mime_type_order
10
-
11
- VALID_MIMETYPE_METHODS = %i[override exif file extension].freeze
12
-
13
- # @param [String] path full path to the file to be worked with
14
- # @param [Hash<Symbol => Object>] params options used during content metadata generation
15
- # @option params [Hash<Symbol => ['yes', 'no']>] :file_attributes e.g. {:preserve=>'yes',:shelve=>'no',:publish=>'no'}, defaults pulled from mimetype
16
- # @option params [String] :label a resource label (files bundled together will just get the first file's label attribute if set)
17
- # @option params [String] :provider_md5 pre-computed MD5 checksum
18
- # @option params [String] :provider_sha1 pre-computed SHA1 checksum
19
- # @option params [String] :relative_path if you want the file ids in the content metadata it can be set, otherwise content metadata will get the full path
20
- # @option params [Array] :mime_type_order can be set to the order in which you want mimetypes to be determined
21
- # options are :override (from manual override mapping if exists), :exif (from exif if exists),
22
- # :extension (from file extension), and :file (from unix file system command)
23
- # the default is defined in the private `default_mime_type_order` method but you can override to set your own order
24
- # @example
25
- # Assembly::ObjectFile.new('/input/path_to_file.tif')
26
- def initialize(path, params = {})
27
- @path = path
28
- @label = params[:label]
29
- @file_attributes = params[:file_attributes]
30
- @relative_path = params[:relative_path]
31
- @provider_md5 = params[:provider_md5]
32
- @provider_sha1 = params[:provider_sha1]
33
- @mime_type_order = params[:mime_type_order] || default_mime_type_order
34
- end
35
-
36
- # @return [String] DPG base filename, removing the extension and the '00','05', etc. placeholders
37
- # @example
38
- # source_file = Assembly::ObjectFile.new('/input/cy565rm7188_00_001.tif')
39
- # puts source_file.dpg_basename # "cy565rm7188_001"
40
- def dpg_basename
41
- file_parts = File.basename(path, ext).split('_')
42
- file_parts.size == 3 ? "#{file_parts[0]}_#{file_parts[2]}" : filename_without_ext
43
- end
44
-
45
- # @return [String] DPG subfolder for the given filename, i.e. '00','05', etc.
46
- # @example
47
- # source_file = Assembly::ObjectFile.new('/input/cy565rm7188_00_001.tif')
48
- # puts source_file.dpg_folder # "00"
49
- def dpg_folder
50
- file_parts = File.basename(path, ext).split('_')
51
- file_parts.size == 3 ? file_parts[1] : ''
52
- end
53
-
54
- # @return [String] base filename
55
- # @example
56
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
57
- # puts source_file.filename # "path_to_file.tif"
58
- def filename
59
- File.basename(path)
60
- end
61
-
62
- # @return [String] base directory
63
- # @example
64
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
65
- # puts source_file.dirname # "/input"
66
- def dirname
67
- File.dirname(path)
68
- end
69
-
70
- # @return [String] filename extension
71
- # @example
72
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
73
- # puts source_file.ext # ".tif"
74
- def ext
75
- File.extname(path)
76
- end
77
-
78
- # @return [String] base filename without extension
79
- # @example
80
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
81
- # puts source_file.filename_without_ext # "path_to_file"
82
- def filename_without_ext
83
- File.basename(path, ext)
84
- end
85
-
86
- # @return [MiniExiftool] exif information stored as a hash and an object
87
- # @example
88
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
89
- # puts source_file.exif # hash with exif information
90
- def exif
91
- @exif ||= begin
92
- check_for_file
93
- MiniExiftool.new(path, replace_invalid_chars: '?')
94
- rescue StandardError
95
- nil
96
- end
97
- end
98
-
99
- # Computes md5 checksum or returns cached value
100
- # @return [String] md5 checksum
101
- # @example
102
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
103
- # puts source_file.md5 # 'XXX123XXX1243XX1243'
104
- def md5
105
- check_for_file unless @md5
106
- @md5 ||= Digest::MD5.file(path).hexdigest
107
- end
108
-
109
- # Computes sha1 checksum or return cached value
110
- # @return [String] sha1 checksum
111
- # @example
112
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
113
- # puts source_file.sha1 # 'XXX123XXX1243XX1243'
114
- def sha1
115
- check_for_file unless @sha1
116
- @sha1 ||= Digest::SHA1.file(path).hexdigest
117
- end
118
-
119
- # Returns mimetype information for the current file based on the ordering set in default_mime_type_order
120
- # We stop computing mimetypes as soon as we have a method that returns a value
121
- # @return [String] mime type
122
- # @example
123
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.txt')
124
- # puts source_file.mimetype # 'text/plain'
125
- def mimetype
126
- @mimetype ||= begin
127
- check_for_file
128
- mimetype = ''
129
- mime_type_order.each do |mime_type_method|
130
- mimetype = public_send("#{mime_type_method}_mimetype") if VALID_MIMETYPE_METHODS.include?(mime_type_method)
131
- break if mimetype.present?
132
- end
133
- mimetype
134
- end
135
- end
136
-
137
- # Returns mimetype information using the manual override mapping (based on a file extension lookup)
138
- # @return [String] mime type for supplied file if a mapping exists for the file's extension
139
- # @example
140
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.json')
141
- # puts source_file.override_mimetype # 'application/json'
142
- def override_mimetype
143
- @override_mimetype ||= Assembly::OVERRIDE_MIMETYPES.fetch(ext.to_sym, '')
144
- end
145
-
146
- # Returns mimetype information using the mime-types gem (based on a file extension lookup)
147
- # @return [String] mime type for supplied file
148
- # @example
149
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.txt')
150
- # puts source_file.extension_mimetype # 'text/plain'
151
- def extension_mimetype
152
- @extension_mimetype ||= begin
153
- mtype = MIME::Types.type_for(path).first
154
- mtype ? mtype.content_type : ''
155
- end
156
- end
157
-
158
- # Returns mimetype information for the current file based on unix file system command.
159
- # @return [String] mime type for supplied file
160
- # @example
161
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.txt')
162
- # puts source_file.file_mimetype # 'text/plain'
163
- def file_mimetype
164
- @file_mimetype ||= begin
165
- check_for_file
166
- `file --mime-type "#{path}"`.delete("\n").split(':')[1].strip # first try and get the mimetype from the unix file command
167
- end
168
- end
169
-
170
- # Returns mimetype information for the current file based on exif data (if available and not a trusted source that we'd rather get from the file system command)
171
- # @return [String] mime type for supplied file
172
- # @example
173
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.txt')
174
- # puts source_file.exif_mimetype # 'text/plain'
175
- def exif_mimetype
176
- @exif_mimetype ||= begin
177
- check_for_file
178
- prefer_exif = !Assembly::TRUSTED_MIMETYPES.include?(file_mimetype) # if it's not a "trusted" mimetype and there is exif data; get the mimetype from the exif
179
- exif.mimetype if
180
- exif&.mimetype && prefer_exif
181
- end
182
- end
183
-
184
- # @note Uses shell call to "file", only expected to work on unix based systems
185
- # @return [String] encoding for supplied file
186
- # @example
187
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.txt')
188
- # puts source_file.encoding # 'us-ascii'
189
- def encoding
190
- @encoding ||= begin
191
- check_for_file
192
- `file --mime-encoding "#{path}"`.delete("\n").split(':')[1].strip
193
- end
194
- end
195
-
196
- # @return [Symbol] the type of object, could be :application (for PDF or Word, etc), :audio, :image, :message, :model, :multipart, :text or :video
197
- # @example
198
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
199
- # puts source_file.object_type # :image
200
- def object_type
201
- lookup = MIME::Types[mimetype][0]
202
- lookup.nil? ? :other : lookup.media_type.to_sym
203
- end
204
-
205
- # @return [Boolean] if object is an image
206
- # @example
207
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
208
- # puts source_file.image? # true
209
- def image?
210
- object_type == :image
211
- end
212
-
213
- # Examines the input image for validity. Used to determine if image is a valid and useful image.
214
- # If image is not a jp2, also checks if it is jp2able?
215
- # @return [Boolean] true if image is valid, false if not.
216
- # @example
217
- # source_img = Assembly::ObjectFile.new('/input/path_to_file.tif')
218
- # puts source_img.valid_image? # true
219
- def valid_image?
220
- return false unless image?
221
-
222
- mimetype == 'image/jp2' || jp2able?
223
- end
224
-
225
- # @return [Boolean] true if image has a color profile, false if not.
226
- # @example
227
- # source_img = Assembly::ObjectFile.new('/input/path_to_file.tif')
228
- # puts source_img.has_color_profile? # true
229
- def has_color_profile?
230
- return false unless exif
231
-
232
- exif['profiledescription'] || exif['colorspace'] ? true : false
233
- end
234
-
235
- # Examines the input image for validity to create a jp2. Same as valid_image? but also confirms the existence of a profile description and further restricts mimetypes.
236
- # It is used by the assembly robots to decide if a jp2 will be created and is also called before you create a jp2 using assembly-image.
237
- # @return [Boolean] true if image should have a jp2 created, false if not.
238
- # @example
239
- # source_img = Assembly::ObjectFile.new('/input/path_to_file.tif')
240
- # puts source_img.jp2able? # true
241
- def jp2able?
242
- return false unless exif
243
-
244
- Assembly::VALID_IMAGE_MIMETYPES.include?(mimetype)
245
- end
246
-
247
- # Returns file size information for the current file in bytes.
248
- # @return [Integer] file size in bytes
249
- # @example
250
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
251
- # puts source_file.filesize # 1345
252
- def filesize
253
- check_for_file
254
- @filesize ||= File.size(path)
255
- end
256
-
257
- # Determines if the file exists (and is not a directory)
258
- # @return [Boolean] file exists
259
- # @example
260
- # source_file = Assembly::ObjectFile.new('/input/path_to_file.tif')
261
- # puts source_file.file_exists? # true
262
- def file_exists?
263
- @file_exists ||= (File.exist?(path) && !File.directory?(path))
264
- end
265
-
266
- private
267
-
268
- # private method defining default preferred ordering of how mimetypes are determined
269
- def default_mime_type_order
270
- %i[override exif file extension]
271
- end
272
-
273
- # private method to check for file existence before operating on it
274
- def check_for_file
275
- raise "input file #{path} does not exist" unless file_exists?
276
- end
277
- end
278
- end