iiif_print 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (181) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +102 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
  19. data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
  20. data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
  21. data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
  22. data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  23. data/app/listeners/iiif_print/listener.rb +31 -0
  24. data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
  25. data/app/models/concerns/iiif_print/solr/document.rb +19 -3
  26. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  27. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  28. data/app/models/iiif_print/pending_relationship.rb +3 -0
  29. data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
  30. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  31. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  32. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
  33. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  34. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  35. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  36. data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
  37. data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
  38. data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
  39. data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
  40. data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
  41. data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
  42. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  43. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  44. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  45. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  46. data/config/initializers/simple_schema_loader.rb +1 -0
  47. data/config/locales/iiif_print.en.yml +4 -0
  48. data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
  49. data/config/routes.rb +3 -0
  50. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
  51. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
  52. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
  53. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  54. data/docker-compose.yml +2 -2
  55. data/iiif_print.gemspec +11 -10
  56. data/lib/generators/iiif_print/install_generator.rb +21 -1
  57. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  58. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  59. data/lib/iiif_print/base_derivative_service.rb +14 -2
  60. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
  61. data/lib/iiif_print/catalog_search_builder.rb +7 -3
  62. data/lib/iiif_print/configuration.rb +205 -8
  63. data/lib/iiif_print/data/fileset_helper.rb +3 -3
  64. data/lib/iiif_print/data/work_derivatives.rb +4 -4
  65. data/lib/iiif_print/engine.rb +53 -15
  66. data/lib/iiif_print/errors.rb +18 -0
  67. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  68. data/lib/iiif_print/image_tool.rb +12 -8
  69. data/lib/iiif_print/jp2_derivative_service.rb +4 -1
  70. data/lib/iiif_print/lineage_service.rb +47 -13
  71. data/lib/iiif_print/metadata.rb +67 -48
  72. data/lib/iiif_print/pdf_derivative_service.rb +3 -1
  73. data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
  74. data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
  75. data/lib/iiif_print/persistence_layer.rb +118 -0
  76. data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
  77. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
  78. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  79. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
  80. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  81. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  82. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  83. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  84. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  85. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  86. data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
  87. data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
  88. data/lib/iiif_print/tiff_derivative_service.rb +3 -1
  89. data/lib/iiif_print/version.rb +1 -1
  90. data/lib/iiif_print.rb +210 -20
  91. data/lib/samvera/derivatives/configuration.rb +83 -0
  92. data/lib/samvera/derivatives/hyrax.rb +129 -0
  93. data/lib/samvera/derivatives.rb +238 -0
  94. data/tasks/copy_authorities_to_test_app.rake +11 -0
  95. data/tasks/iiif_print_dev.rake +4 -4
  96. metadata +111 -196
  97. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  98. data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
  99. data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
  100. data/bin/rails +0 -13
  101. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
  102. data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
  103. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  104. data/spec/.keep.txt +0 -1
  105. data/spec/factories/ability.rb +0 -6
  106. data/spec/factories/newspaper_issue.rb +0 -7
  107. data/spec/factories/newspaper_page.rb +0 -7
  108. data/spec/factories/newspaper_page_solr_document.rb +0 -12
  109. data/spec/factories/newspaper_title.rb +0 -8
  110. data/spec/factories/uploaded_pdf_file.rb +0 -9
  111. data/spec/factories/uploaded_txt_file.rb +0 -9
  112. data/spec/factories/user.rb +0 -13
  113. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  114. data/spec/fixtures/files/4.1.07.tiff +0 -0
  115. data/spec/fixtures/files/README.md +0 -7
  116. data/spec/fixtures/files/alto-2-0.xsd +0 -714
  117. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  118. data/spec/fixtures/files/credits.md +0 -16
  119. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  120. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  121. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  122. data/spec/fixtures/files/minimal-alto.xml +0 -31
  123. data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
  124. data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
  125. data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
  126. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  127. data/spec/fixtures/files/ocr_alto.xml +0 -202
  128. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
  129. data/spec/fixtures/files/ocr_color.tiff +0 -0
  130. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  131. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  132. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  133. data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
  134. data/spec/fixtures/files/page1.tiff +0 -0
  135. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  136. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  137. data/spec/fixtures/files/thumbnail.jpg +0 -0
  138. data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
  139. data/spec/helpers/iiif_print_helper_spec.rb +0 -43
  140. data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
  141. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
  142. data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
  143. data/spec/iiif_print/configuration_spec.rb +0 -67
  144. data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
  145. data/spec/iiif_print/data/work_file_spec.rb +0 -99
  146. data/spec/iiif_print/data/work_files_spec.rb +0 -237
  147. data/spec/iiif_print/image_tool_spec.rb +0 -109
  148. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
  149. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
  150. data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
  151. data/spec/iiif_print/lineage_service_spec.rb +0 -13
  152. data/spec/iiif_print/metadata_spec.rb +0 -115
  153. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
  154. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
  155. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
  156. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
  157. data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
  158. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
  159. data/spec/iiif_print_spec.rb +0 -51
  160. data/spec/misc_shared.rb +0 -111
  161. data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
  162. data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
  163. data/spec/models/solr_document_spec.rb +0 -14
  164. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
  165. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
  166. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
  167. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
  168. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
  169. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
  170. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
  171. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
  172. data/spec/spec_helper.rb +0 -181
  173. data/spec/support/controller_level_helpers.rb +0 -28
  174. data/spec/support/iiif_print_models.rb +0 -127
  175. data/spec/test_app_templates/blacklight.yml +0 -9
  176. data/spec/test_app_templates/fedora.yml +0 -15
  177. data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
  178. data/spec/test_app_templates/redis.yml +0 -9
  179. data/spec/test_app_templates/solr/conf/schema.xml +0 -362
  180. data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
  181. data/spec/test_app_templates/solr.yml +0 -7
@@ -8,78 +8,83 @@ module IiifPrint
8
8
  # For dpi extraction, falls back to calculating using MiniMagick,
9
9
  # if necessary.
10
10
  class PdfImageExtractionService
11
- # class constant column numbers
12
- COL_WIDTH = 3
13
- COL_HEIGHT = 4
14
- COL_COLOR = 5
15
- COL_CHANNELS = 6
16
- COL_BITS = 7
17
- # only poppler 0.25+ has this column in output:
18
- COL_XPPI = 12
19
-
20
11
  def initialize(path)
21
12
  @path = path
22
- @cmd = format('pdfimages -list %<path>s', path: path)
23
- @output = nil
24
- @entries = nil
13
+ process(command: format('pdfimages -list %<path>s 2>/dev/null', path: path))
25
14
  end
26
15
 
27
- def process
28
- # call just once
29
- if @output.nil?
30
- Open3.popen3(@cmd) do |_stdin, stdout, _stderr, _wait_thr|
31
- @output = stdout.read.split("\n")
32
- end
33
- end
34
- @output.slice(2, @output.size - 1)
35
- end
16
+ attr_reader :path, :page_count, :width, :height, :pixels_per_inch
17
+ alias ppi pixels_per_inch
36
18
 
37
- def entries
38
- if @entries.nil?
39
- @entries = []
40
- output = process
41
- (0..output.size - 1).each do |i|
42
- @entries.push(output[i].gsub(/\s+/m, ' ').strip.split(" "))
43
- end
44
- end
45
- @entries
19
+ # @return [Array<String, Integer, Integer>]
20
+ def color
21
+ [@color_description, @channels, @bits]
46
22
  end
47
23
 
48
- def selectcolumn(i, &block)
49
- result = entries.map { |e| e[i] }
50
- return result.map!(&block) if block_given?
51
- result
52
- end
24
+ private
53
25
 
54
- def width
55
- selectcolumn(COL_WIDTH, &:to_i).max
56
- end
26
+ # class constant column numbers
27
+ COL_WIDTH = 3
28
+ COL_HEIGHT = 4
29
+ COL_COLOR_DESC = 5
30
+ COL_CHANNELS = 6
31
+ COL_BITS = 7
32
+ # only poppler 0.25+ has this column in output:
33
+ COL_XPPI = 12
57
34
 
58
- def height
59
- selectcolumn(COL_HEIGHT, &:to_i).max
60
- end
35
+ # rubocop:disable Metrics/AbcSize - Because this helps us process the results in one loop.
36
+ # rubocop:disable Metrics/MethodLength - Again, to help speed up the processing loop.
37
+ # rubocop:disable Metrics/CyclomaticComplexity
38
+ # rubocop:disable Metrics/PerceivedComplexity
39
+ #
40
+ # The first two lines are tabular header information:
41
+ #
42
+ # Example:
43
+ #
44
+ # bash-5.1$ pdfimages -list fmc_color.pdf | head -5
45
+ # page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio
46
+ # --------------------------------------------------------------------------------------------
47
+ # 1 0 image 2475 413 rgb 3 8 jpeg no 10 0 300 300 21.8K 0.7%
48
+ def process(command:)
49
+ @page_count = 0
50
+ @color_description = 'gray'
51
+ @width = 0
52
+ @height = 0
53
+ @channels = 0
54
+ @bits = 0
55
+ @pixels_per_inch = 0
56
+ Open3.popen3(command) do |_stdin, stdout, _stderr, _wait_thr|
57
+ stdout.read.split("\n").each_with_index do |line, index|
58
+ # Skip the two header lines
59
+ next if index <= 1
60
+ @page_count += 1
61
+ cells = line.gsub(/\s+/m, ' ').strip.split(" ")
61
62
 
62
- def color
63
- # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
64
- # so caller may want all of this information, and in case of
65
- # mixed color spaces across images, this returns maximum
66
- desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
67
- channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
68
- bits = entries.map { |e| e[COL_BITS].to_i }.max
69
- [desc, channels, bits]
70
- end
63
+ @color_description = 'rgb' if cells[COL_COLOR_DESC] != 'gray'
64
+ @width = cells[COL_WIDTH].to_i if cells[COL_WIDTH].to_i > @width
65
+ @height = cells[COL_HEIGHT].to_i if cells[COL_HEIGHT].to_i > @height
66
+ @channels = cells[COL_CHANNELS].to_i if cells[COL_CHANNELS].to_i > @channels
67
+ @bits = cells[COL_BITS].to_i if cells[COL_BITS].to_i > @bits
71
68
 
72
- def ppi
73
- if entries[0].size <= 12
74
- # poppler < 0.25
75
- pdf = MiniMagick::Image.open(@path)
76
- width_points = pdf.width
77
- width_px = width
78
- return (72 * width_px / width_points).to_i
69
+ # In the case of poppler version < 0.25, we will have no more than 12 columns. As such,
70
+ # we need to do some alternative magic to calculate this.
71
+ if @page_count == 1 && cells.size <= 12
72
+ pdf = MiniMagick::Image.open(@path)
73
+ width_points = pdf.width
74
+ width_px = width
75
+ @pixels_per_inch = (72 * width_px / width_points).to_i
76
+ elsif cells[COL_XPPI].to_i > @pixels_per_inch
77
+ # By the magic of nil#to_i if we don't have more than 12 columns, we've already set
78
+ # the @pixels_per_inch and this line won't do much of anything.
79
+ @pixels_per_inch = cells[COL_XPPI].to_i
80
+ end
81
+ end
79
82
  end
80
- # with poppler 0.25+, pdfimages just gives us this:
81
- selectcolumn(COL_XPPI, &:to_i).max
82
83
  end
84
+ # rubocop:enable Metrics/AbcSize
85
+ # rubocop:enable Metrics/MethodLength
86
+ # rubocop:enable Metrics/CyclomaticComplexity
87
+ # rubocop:enable Metrics/PerceivedComplexity
83
88
  end
84
89
  end
85
90
  end
@@ -84,6 +84,7 @@ module IiifPrint
84
84
  # add trailing space to plaintext buffer for between words:
85
85
  @text += ' '
86
86
  @words.push(@current) if word_complete?
87
+ @current = nil # clear the current word
87
88
  end
88
89
 
89
90
  def end_line
@@ -120,9 +121,12 @@ module IiifPrint
120
121
  # for current word, and append line endings to plain text:
121
122
  #
122
123
  # @param name [String] element name.
123
- def end_element(_name)
124
- end_line if @element_class_name == 'ocr_line'
125
- end_word if @element_class_name == 'ocrx_word'
124
+ def end_element(name)
125
+ if name == 'span'
126
+ end_word if @element_class_name == 'ocrx_word'
127
+ @text += "\n" if @element_class_name.nil?
128
+ end
129
+ @element_class_name = nil
126
130
  end
127
131
 
128
132
  # Callback for completion of parsing hOCR, used to normalize generated
@@ -9,7 +9,7 @@ module IiifPrint
9
9
  class PageOCR
10
10
  attr_accessor :html, :path
11
11
 
12
- def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
12
+ def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options)
13
13
  @path = path
14
14
  # hOCR html:
15
15
  @html = nil
@@ -17,13 +17,14 @@ module IiifPrint
17
17
  @source_meta = nil
18
18
  @box = nil
19
19
  @plain = nil
20
- @additional_tessearct_options = additional_tessearct_options
20
+ @additional_tesseract_options = additional_tesseract_options
21
21
  end
22
22
 
23
23
  def run_ocr
24
24
  outfile = File.join(Dir.mktmpdir, 'output_html')
25
- cmd = "tesseract #{path} #{outfile} hocr"
26
- cmd += " #{@additional_tessearct_options}" if @additional_tessearct_options.present?
25
+ cmd = "OMP_THREAD_LIMIT=1 tesseract #{path} #{outfile}"
26
+ cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present?
27
+ cmd += " hocr"
27
28
  `#{cmd}`
28
29
  outfile + '.hocr'
29
30
  end
@@ -28,13 +28,15 @@ module IiifPrint
28
28
 
29
29
  ocr_derivatives.each do |extension, method_name|
30
30
  path = prepare_path(extension.to_s)
31
- write(content: ocr.public_send(method_name), path: path)
31
+ write(content: ocr.public_send(method_name), path: path, extension: extension)
32
32
  end
33
33
  end
34
34
 
35
- def write(content:, path:)
35
+ def write(content:, path:, extension:)
36
+ mime_type = mime_type_for(extension)
36
37
  File.open(path, 'w') do |outfile|
37
38
  outfile.write(content)
39
+ IiifPrint.copy_derivatives_from_data_store(stream: content, directives: { url: path, container: 'extracted_text', mime_type: mime_type })
38
40
  end
39
41
  end
40
42
 
@@ -4,9 +4,10 @@ module IiifPrint
4
4
  # NOTE: to keep this from conflicting with TextExtractionDerivativeService,
5
5
  # this class should be invoked by it, not PluggableDerivativeService.
6
6
  class TextFormatsFromALTOService < BaseDerivativeService
7
- self.target_extension = 'tiff'.freeze
7
+ self.target_extension = 'txt'.freeze
8
8
 
9
9
  def save_derivative(destination, data)
10
+ mime_type = mime_type_for(destination)
10
11
  # Load/prepare base of "pairtree" dir structure for extension, fileset
11
12
  prepare_path(destination)
12
13
  #
@@ -17,6 +18,7 @@ module IiifPrint
17
18
  # Write data as UTF-8 encoded text
18
19
  File.open(save_path, "w:UTF-8") do |f|
19
20
  f.write(data)
21
+ IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'extracted_text', mime_type: mime_type })
20
22
  end
21
23
  end
22
24
 
@@ -32,7 +32,9 @@ module IiifPrint
32
32
  source_path += '[0]' if @source_path.ends_with?('pdf')
33
33
  template = use_color? ? COLOR_CMD : GRAY_CMD
34
34
  template = MONO_CMD if one_bit?
35
- format(template, source_file: source_path, out_file: @dest_path)
35
+ data = format(template, source_file: source_path, out_file: @dest_path)
36
+ IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'service_file', mime_type: mime_type_for(target_extension) })
37
+ data
36
38
  end
37
39
 
38
40
  def create_derivatives(filename)
@@ -1,3 +1,3 @@
1
1
  module IiifPrint
2
- VERSION = '1.0.0'.freeze
2
+ VERSION = '2.0.0'.freeze
3
3
  end
data/lib/iiif_print.rb CHANGED
@@ -14,13 +14,16 @@ require "iiif_print/tiff_derivative_service"
14
14
  require "iiif_print/lineage_service"
15
15
  require "iiif_print/metadata"
16
16
  require "iiif_print/works_controller_behavior"
17
- require "iiif_print/jobs/application_job"
18
17
  require "iiif_print/blacklight_iiif_search/annotation_decorator"
19
- require "iiif_print/jobs/child_works_from_pdf_job"
20
- require "iiif_print/jobs/create_relationships_job"
21
- require "iiif_print/split_pdfs/pages_into_images_service"
18
+ require "iiif_print/split_pdfs/base_splitter"
22
19
  require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"
20
+ require "iiif_print/split_pdfs/derivative_rodeo_splitter"
21
+ require "iiif_print/split_pdfs/destroy_pdf_child_works_service"
22
+ require "iiif_print/persistence_layer"
23
+ require "iiif_print/persistence_layer/active_fedora_adapter"
24
+ require "iiif_print/persistence_layer/valkyrie_adapter"
23
25
 
26
+ # rubocop:disable Metrics/ModuleLength
24
27
  module IiifPrint
25
28
  extend ActiveSupport::Autoload
26
29
  autoload :Configuration
@@ -28,9 +31,10 @@ module IiifPrint
28
31
 
29
32
  ##
30
33
  # @api public
34
+ #
31
35
  # Exposes the IiifPrint configuration.
32
36
  #
33
- # @yield [IiifPrint::Configuration] if a block is passed
37
+ # @yieldparam [IiifPrint::Configuration] config if a block is passed
34
38
  # @return [IiifPrint::Configuration]
35
39
  # @see IiifPrint::Configuration for configuration options
36
40
  def self.config(&block)
@@ -39,28 +43,64 @@ module IiifPrint
39
43
  @config
40
44
  end
41
45
 
46
+ class << self
47
+ delegate(
48
+ :persistence_adapter,
49
+ :skip_splitting_pdf_files_that_end_with_these_texts,
50
+ to: :config
51
+ )
52
+
53
+ delegate(
54
+ :clean_for_tests!,
55
+ :copy_derivatives_from_data_store,
56
+ :create_relationship_between,
57
+ :destroy_children_split_from,
58
+ :extract_text_for,
59
+ :find_by,
60
+ :find_by_title_for,
61
+ :grandparent_for,
62
+ :index_works,
63
+ :object_in_works,
64
+ :object_ordered_works,
65
+ :parent_for,
66
+ :pdf?,
67
+ :save,
68
+ :solr_construct_query,
69
+ :solr_name,
70
+ :solr_query,
71
+ to: :persistence_adapter
72
+ )
73
+ end
74
+
75
+ # NOTE: We use lambdas so we can have default values but also provide a lazy configuration.
76
+ # There are certainly better ways but this is the least intrusive refactor from prior state.
42
77
  DEFAULT_MODEL_CONFIGURATION = {
43
78
  # Split a PDF into individual page images and create a new child work for each image.
44
- pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob,
45
- pdf_splitter_service: IiifPrint::SplitPdfs::PagesIntoImagesService,
46
- derivative_service_plugins: [
47
- IiifPrint::JP2DerivativeService,
48
- IiifPrint::PDFDerivativeService,
49
- IiifPrint::TextExtractionDerivativeService,
50
- IiifPrint::TIFFDerivativeService
51
- ]
79
+ pdf_splitter_job: -> { IiifPrint::Jobs::ChildWorksFromPdfJob },
80
+ pdf_splitter_service: -> { IiifPrint::SplitPdfs::PagesToJpgsSplitter },
81
+ derivative_service_plugins: lambda {
82
+ [
83
+ IiifPrint::TextExtractionDerivativeService
84
+ ]
85
+ }
52
86
  }.freeze
53
87
 
54
88
  # This is the record level configuration for PDF split handling.
55
89
  ModelConfig = Struct.new(:pdf_split_child_model, *DEFAULT_MODEL_CONFIGURATION.keys, keyword_init: true)
90
+ private_constant :ModelConfig
56
91
 
57
- # This method is responsible for assisting in the configuration of a "model".
92
+ ##
93
+ # @api public
94
+ # This method is responsible for configuring a model for additional derivative generation.
58
95
  #
59
96
  # @example
60
97
  # class Book < ActiveFedora::Base
61
98
  # include IiifPrint.model_configuration(
62
99
  # pdf_split_child_model: Page,
63
100
  # derivative_service_plugins: [
101
+ # IiifPrint::JP2DerivativeService,
102
+ # IiifPrint::PDFDerivativeService,
103
+ # IiifPrint::TextExtractionDerivativeService,
64
104
  # IiifPrint::TIFFDerivativeService
65
105
  # ]
66
106
  # )
@@ -68,29 +108,73 @@ module IiifPrint
68
108
  #
69
109
  # @param kwargs [Hash<Symbol,Object>] the configuration values that overrides the
70
110
  # DEFAULT_MODEL_CONFIGURATION.
111
+ # @option kwargs [Array<Class>] derivative_service_plugins the various derivatives to run on the
112
+ # "original" files associated with this work. Options include:
113
+ # {IiifPrint::JP2DerivativeService}, {IiifPrint::PDFDerivativeService},
114
+ # {IiifPrint::TextExtractionDerivativeService}, {IiifPrint::TIFFDerivativeService}
115
+ # @option kwargs [Class] pdf_splitter_job responsible for handling the splitting of the original file
116
+ # @option kwargs [Class] pdf_split_child_model when we split the file into pages, what's the child model
117
+ # we want for those pages? Often times this is likely the same model as the parent.
118
+ # @option kwargs [Class] pdf_splitter_service the specific service that splits the PDF. Options are:
119
+ # {IiifPrint::SplitPdfs::PagesToJpgsSplitter},
120
+ # {IiifPrint::SplitPdfs::PagesToTiffsSplitter},
121
+ # {IiifPrint::SplitPdfs::PagesToPngsSplitter},
122
+ # {IiifPrint::SplitPdfs::DerivativeRodeoSplitter}
71
123
  #
72
124
  # @return [Module]
73
125
  #
74
126
  # @see IiifPrint::DEFAULT_MODEL_CONFIGURATION
75
127
  # @todo Because not every job will split PDFs and write to a child model. May want to introduce
76
128
  # an alternative splitting method to create new filesets on the existing work instead of new child works.
129
+ # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
77
130
  def self.model_configuration(**kwargs)
78
131
  Module.new do
79
- def iiif_print_config?
80
- true
132
+ extend ActiveSupport::Concern
133
+
134
+ included do
135
+ work_type = self # In this case self is the class we're mixing the new module into.
136
+
137
+ # Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
138
+ indexer = if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
139
+ IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_with_adapter_logic(work_type: work_type)
140
+ elsif work_type < ActiveFedora::Base
141
+ IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_with_adapter_logic(work_type: work_type)
142
+ else
143
+ raise "Unable to mix '.model_configuration' into #{work_type}"
144
+ end
145
+
146
+ # Ensure that the form for the work_type is likewise decorated with the adapter logic for IiifPrint
147
+ if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
148
+ IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_form_with_adapter_logic(work_type: work_type)
149
+ elsif work_type < ActiveFedora::Base
150
+ IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_form_with_adapter_logic(work_type: work_type)
151
+ else
152
+ raise "Unable to mix '.model_configuration' into #{work_type}"
153
+ end
154
+
155
+ # Deriving lineage of objects is a potentially complicated thing. We provide a default
156
+ # service but each work_type's indexer can be configured by amending it's
157
+ # {.iiif_print_lineage_service}.
158
+ indexer.class_attribute(:iiif_print_lineage_service, default: IiifPrint::LineageService) unless indexer.respond_to?(:iiif_print_lineage_service)
159
+ work_type::GeneratedResourceSchema.send(:include, IiifPrint::SetChildFlag) if work_type.const_defined?(:GeneratedResourceSchema)
81
160
  end
82
161
 
83
162
  # We don't know what you may want in your configuration, but from this gem's implementation,
84
163
  # we're going to provide the defaults to ensure that it works.
85
164
  DEFAULT_MODEL_CONFIGURATION.each_pair do |key, default_value|
86
- kwargs[key] ||= default_value
165
+ kwargs[key] ||= default_value.call
87
166
  end
88
167
 
89
168
  define_method(:iiif_print_config) do
90
169
  @iiif_print_config ||= ModelConfig.new(**kwargs)
91
170
  end
171
+
172
+ def iiif_print_config?
173
+ true
174
+ end
92
175
  end
93
176
  end
177
+ # rubocop:enable Metrics/MethodLength
94
178
 
95
179
  # @api public
96
180
  #
@@ -107,7 +191,7 @@ module IiifPrint
107
191
  # @see Hyrax::IiifManifestPresenter#manifest_metadata
108
192
  def self.manifest_metadata_for(work:,
109
193
  version: config.default_iiif_manifest_version,
110
- fields: default_fields_for(work),
194
+ fields: defined?(AllinsonFlex) ? fields_for_allinson_flex : default_fields,
111
195
  current_ability:,
112
196
  base_url:)
113
197
  Metadata.build_metadata_for(work: work,
@@ -117,6 +201,11 @@ module IiifPrint
117
201
  base_url: base_url)
118
202
  end
119
203
 
204
+ def self.manifest_metadata_from(work:, presenter:)
205
+ current_ability = presenter.try(:ability) || presenter.try(:current_ability)
206
+ base_url = presenter.try(:base_url) || presenter.try(:request)&.base_url
207
+ IiifPrint.manifest_metadata_for(work: work, current_ability: current_ability, base_url: base_url)
208
+ end
120
209
  # Hash is an arbitrary attribute key/value pairs
121
210
  # Struct is a defined set of attribute "keys". When we favor defined values,
122
211
  # then we are naming the concept and defining the range of potential values.
@@ -124,13 +213,114 @@ module IiifPrint
124
213
 
125
214
  # @api private
126
215
  # @todo Figure out a way to use a custom label; right now it gets rendered from the title.
127
- def self.default_fields_for(_work, fields: config.metadata_fields)
216
+ def self.default_fields(fields: config.metadata_fields)
128
217
  fields.map do |field|
129
218
  Field.new(
130
219
  name: field.first,
131
- label: Hyrax::Renderers::AttributeRenderer.new(field, nil).label,
220
+ label: Hyrax::Renderers::AttributeRenderer.new(field.first, nil).label,
132
221
  options: field.last
133
222
  )
134
223
  end
135
224
  end
225
+
226
+ ##
227
+ # @param fields [Array<#name, #value, #indexing>] the Allinson Flex properties (see .allinson_flex_fields)
228
+ def self.fields_for_allinson_flex(fields: allinson_flex_fields, sort_order: IiifPrint.config.iiif_metadata_field_presentation_order)
229
+ fields = sort_af_fields!(fields, sort_order: sort_order)
230
+ fields.each_with_object({}) do |field, hash|
231
+ # filters out admin_only fields
232
+ next if field.indexing&.include?('admin_only')
233
+
234
+ # WARNING: This is assuming A LOT
235
+ # This is taking the Allinson Flex fields that have the same name and only
236
+ # using the first one while discarding the rest. There is currently no way to
237
+ # control which one(s) are discarded but this fits for the moment.
238
+ next if hash.key?(field.name)
239
+
240
+ # currently only supports the faceted option
241
+ # Why the `render_as:`? This was originally derived from Hyku default attributes
242
+ # @see https://github.com/samvera/hyku/blob/c702844de4c003eaa88eb5a7514c7a1eae1b289e/app/views/hyrax/base/_attribute_rows.html.erb#L3
243
+ hash[field.name] = Field.new(
244
+ name: field.name,
245
+ label: field.value,
246
+ options: field.indexing&.include?('facetable') ? { render_as: :faceted } : nil
247
+ )
248
+ end.values
249
+ end
250
+
251
+ CollectionFieldShim = Struct.new(:name, :value, :indexing, keyword_init: true)
252
+
253
+ ##
254
+ # @return [Array<#name, #value, #indexing>] the profile properties, plus a CollectionFieldShim when no 'collection' property exists
255
+ def self.allinson_flex_fields
256
+ return @allinson_flex_fields if defined?(@allinson_flex_fields)
257
+
258
+ allinson_flex_relation = AllinsonFlex::ProfileProperty
259
+ .joins(:texts)
260
+ .where(allinson_flex_profile_texts: { name: 'display_label' })
261
+ .distinct
262
+ .select(:name, :value, :indexing)
263
+ flex_fields = allinson_flex_relation.to_a
264
+ unless allinson_flex_relation.exists?(name: 'collection')
265
+ collection_field = CollectionFieldShim.new(name: :collection, value: 'Collection', indexing: [])
266
+ flex_fields << collection_field
267
+ end
268
+ @allinson_flex_fields = flex_fields
269
+ end
270
+
271
+ ##
272
+ # @param fields [Array<#name>]
273
+ # @param sort_order [Array<Symbol>]
274
+ def self.sort_af_fields!(fields, sort_order:)
275
+ return fields if sort_order.blank?
276
+
277
+ fields.sort_by do |field|
278
+ sort_order_index = sort_order.index(field.name.to_sym)
279
+ sort_order_index.nil? ? sort_order.length : sort_order_index
280
+ end
281
+ end
282
+
283
+ ##
284
+ # @api public
285
+ #
286
+ # @param work [ActiveFedora::Base]
287
+ # @param file_set [FileSet]
288
+ # @param locations [Array<String>]
289
+ # @param user [User]
290
+ #
291
+ # @return [Symbol] when none of the locations are to be split.
292
+ def self.conditionally_submit_split_for(work:, file_set:, locations:, user:, skip_these_endings: skip_splitting_pdf_files_that_end_with_these_texts)
293
+ locations = locations.select { |location| split_for_path_suffix?(location, skip_these_endings: skip_these_endings) }
294
+ return :no_pdfs_for_splitting if locations.empty?
295
+
296
+ # Hyrax::FileSet ids are Valkyrie::ID's which can't be passed, so we call id on that and get the string id
297
+ file_set_id = file_set.id.try(:id) || file_set.id
298
+ work_admin_set_id = work.admin_set_id.try(:id) || work.admin_set_id
299
+
300
+ work.try(:iiif_print_config)&.pdf_splitter_job&.perform_later(
301
+ file_set_id,
302
+ locations,
303
+ user,
304
+ work_admin_set_id,
305
+ 0 # A no longer used parameter; but we need to preserve the method signature (for now)
306
+ )
307
+ end
308
+
309
+ ##
310
+ # @api public
311
+ #
312
+ # @param path [String] the path, hopefully with an extension, to the file we're considering
313
+ # splitting.
314
+ # @param skip_these_endings [Array<#downcase>] the endings that we should skip for splitting
315
+ # purposes.
316
+ # @return [TrueClass] when the path is one we should split
317
+ # @return [FalseClass] when the path is one we should not split
318
+ #
319
+ # @see .skip_splitting_pdf_files_that_end_with_these_texts
320
+ def self.split_for_path_suffix?(path, skip_these_endings: skip_splitting_pdf_files_that_end_with_these_texts)
321
+ return false unless path.downcase.end_with?('.pdf')
322
+ return true if skip_these_endings.empty?
323
+ !path.downcase.end_with?(*skip_these_endings.map(&:downcase))
324
+ end
136
325
  end
326
+ # rubocop:enable Metrics/ModuleLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Samvera
4
+ module Derivatives
5
+ ##
6
+ # The purpose of this class is to contain the explicit derivative generation directives for the
7
+ # upstream application.
8
+ #
9
+ # @note The implicit derivative types for Hyrax are as follows:
10
+ # - type :extracted_text with sources [:pdf, :office_document]
11
+ # - type :thumbnail with sources [:pdf, :office_document, :thumbnail, :image]
12
+ # - type :mp3 with sources [:audio]
13
+ # - type :ogg with sources [:audio]
14
+ # - type :webm with sources [:video]
15
+ # - type :mp4 with sources [:video]
16
+ #
17
+ # @note A long-standing practice of Samvera's Hyrax has been to have assumptive and implicit
18
+ # derivative generation (see Hyrax::FileSetDerivativesService). In being implicit, a
19
+ # challenge arises, namely overriding and configuring. There exists a crease in the code
20
+ # to allow for a different derivative approach (see Hyrax::DerivativeService). Yet that
21
+ # approach continues the tradition of implicit work.
22
+ class Configuration
23
+ def initialize
24
+ # Favoring a Hash for ease of lookup as well as the concept that there can be only one entry
25
+ # per type.
26
+ @registered_types = {}
27
+ end
28
+
29
+ # TODO: Consider the appropriate extension
30
+ RegisteredType = Struct.new(:type, :locators, :applicators, :applicability, keyword_init: true) do
31
+ def applicable_for?(file_set:)
32
+ applicability.call(file_set)
33
+ end
34
+ end
35
+
36
+ ##
37
+ # @api public
38
+ #
39
+ # @param type [Symbol] The named type of derivative
40
+ # @param locators [Array<Samvera::Derivatives::FileLocator::Strategy>] The strategies that
41
+ # we'll attempt in finding the derivative that we will later apply.
42
+ # @param applicators [Array<Samvera::Derivatives::FileApplicator::Strategy>] The strategies
43
+ # that we'll use to apply the found derivative to the {FileSet}
44
+ #
45
+ # @yieldparam applicability [#call]
46
+ #
47
+ # @return [RegisteredType]
48
+ #
49
+ # @note What is the best mechanism for naming the sources? At present we're doing a lot of
50
+ # assumption on the types.
51
+ def register(type:, locators:, applicators:, &applicability)
52
+ # Should the validator be required?
53
+ @registered_types[type.to_sym] = RegisteredType.new(
54
+ type: type.to_sym,
55
+ locators: Array(locators),
56
+ applicators: Array(applicators),
57
+ applicability: applicability || default_applicability
58
+ )
59
+ end
60
+
61
+ ##
62
+ # @api public
63
+ #
64
+ # @param type [Symbol]
65
+ #
66
+ # @return [RegisteredType]
67
+ def registry_for(type:)
68
+ @registered_types.fetch(type.to_sym) { empty_registry_for(type: type.to_sym) }
69
+ end
70
+
71
+ private
72
+
73
+ def empty_registry_for(type:)
74
+ RegisteredType.new(type: type, locators: [], applicators: [], applicability: ->(_file_set) { false })
75
+ end
76
+
77
+ # We're going to assume this is true unless configured otherwise.
78
+ def default_applicability
79
+ ->(_file_set) { true }
80
+ end
81
+ end
82
+ end
83
+ end