iiif_print 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +98 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
  19. data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
  20. data/app/models/concerns/iiif_print/solr/document.rb +14 -0
  21. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  22. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  23. data/app/models/iiif_print/pending_relationship.rb +3 -0
  24. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  25. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  26. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
  27. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  28. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  29. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  30. data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
  31. data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
  32. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  33. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  34. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  35. data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
  36. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  37. data/config/locales/iiif_print.en.yml +4 -0
  38. data/config/routes.rb +3 -0
  39. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  40. data/docker-compose.yml +2 -2
  41. data/iiif_print.gemspec +10 -9
  42. data/lib/generators/iiif_print/install_generator.rb +21 -1
  43. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  44. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  45. data/lib/iiif_print/base_derivative_service.rb +2 -1
  46. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
  47. data/lib/iiif_print/catalog_search_builder.rb +5 -1
  48. data/lib/iiif_print/configuration.rb +145 -8
  49. data/lib/iiif_print/data/fileset_helper.rb +1 -1
  50. data/lib/iiif_print/data/work_derivatives.rb +3 -3
  51. data/lib/iiif_print/engine.rb +7 -13
  52. data/lib/iiif_print/errors.rb +18 -0
  53. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  54. data/lib/iiif_print/image_tool.rb +12 -8
  55. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
  56. data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
  57. data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  58. data/lib/iiif_print/lineage_service.rb +29 -8
  59. data/lib/iiif_print/metadata.rb +67 -48
  60. data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
  61. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
  62. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  63. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
  64. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  65. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  66. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  67. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  68. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  69. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  70. data/lib/iiif_print/version.rb +1 -1
  71. data/lib/iiif_print.rb +167 -12
  72. data/lib/samvera/derivatives/configuration.rb +83 -0
  73. data/lib/samvera/derivatives/hyrax.rb +129 -0
  74. data/lib/samvera/derivatives.rb +238 -0
  75. data/spec/factories/newspaper_page_solr_document.rb +9 -1
  76. data/spec/fixtures/authorities/licenses.yml +4 -0
  77. data/spec/fixtures/authorities/rights_statements.yml +4 -0
  78. data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
  79. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
  80. data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
  81. data/spec/iiif_print/configuration_spec.rb +141 -15
  82. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
  83. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
  84. data/spec/iiif_print/lineage_service_spec.rb +1 -1
  85. data/spec/iiif_print/metadata_spec.rb +157 -23
  86. data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
  87. data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
  88. data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
  89. data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
  90. data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
  91. data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
  92. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
  93. data/spec/iiif_print_spec.rb +125 -5
  94. data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
  95. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
  96. data/spec/samvera/derivatives/configuration_spec.rb +41 -0
  97. data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
  98. data/spec/samvera/derivatives_spec.rb +54 -0
  99. data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
  100. data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
  101. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
  102. data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
  103. data/tasks/copy_authorities_to_test_app.rake +11 -0
  104. data/tasks/iiif_print_dev.rake +4 -4
  105. metadata +123 -35
  106. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  107. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  108. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
@@ -1,41 +1,67 @@
1
1
  module IiifPrint
2
2
  module Jobs
3
- # Break a pdf into individual pages
3
+ # Link newly created child works to the parent
4
4
  class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
5
- # Link newly created child works to the parent
6
- # @param user: [User] user
5
+ include Hyrax::Lockable
6
+
7
+ RETRY_MAX = 10
8
+
7
9
  # @param parent_id: [<String>] parent work id
8
10
  # @param parent_model: [<String>] parent model
9
11
  # @param child_model: [<String>] child model
10
- def perform(user:, parent_id:, parent_model:, child_model:)
11
- if completed_child_data_for(parent_id, child_model)
12
+ # @param retries: [<Integer>] count used during rescheduling to prevent infinite retries
13
+ def perform(parent_id:, parent_model:, child_model:, retries: 0, **)
14
+ @parent_id = parent_id
15
+ @parent_model = parent_model
16
+ @child_model = child_model
17
+ @retries = retries + 1
18
+
19
+ @number_of_successes = 0
20
+ @number_of_failures = 0
21
+ @parent_record_members_added = false
22
+ @errors = []
23
+
24
+ # Because we need our children in the correct order, we can't create any
25
+ # relationships until all child works have been created.
26
+ if completed_child_data
12
27
  # add the members
13
- parent_work = parent_model.constantize.find(parent_id)
14
- create_relationships(user: user, parent: parent_work, ordered_children: @child_works)
15
- @pending_children.each(&:destroy)
28
+ add_children_to_parent
29
+ if @number_of_failures.zero? && @number_of_successes == @pending_children.count
30
+ # remove pending relationships upon valid completion
31
+ @pending_children.each(&:destroy)
32
+ elsif @number_of_failures.zero? && @number_of_successes > @pending_children.count
33
+ # remove pending relationships but raise error that too many relationships formed
34
+ @pending_children.each(&:destroy)
35
+ raise "CreateRelationshipsJob for parent id: #{@parent_id} " \
36
+ "added #{@number_of_successes} children, " \
37
+ "expected #{@pending_children} children."
38
+ else
39
+ # report failures & keep pending relationships
40
+ raise "CreateRelationshipsJob failed for parent id: #{@parent_id} " \
41
+ "had #{@number_of_successes} successes & #{@number_of_failures} failures, " \
42
+ "with errors: #{@errors}. Wanted #{@pending_children} children."
43
+ end
16
44
  else
17
- # reschedule the job and end this one normally
18
- #
19
- # TODO: Depending on how things shake out, we could be infinitely rescheduling this job.
20
- # Consider a time to live parameter.
21
- reschedule(user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model)
45
+ # if we aren't ready yet, reschedule the job and end this one normally
46
+ reschedule_job
22
47
  end
23
48
  end
24
49
 
25
50
  private
26
51
 
27
- # load @child_works, and return true or false
28
- def completed_child_data_for(parent_id, child_model)
52
+ # load @child_works and @pending_children, and
53
+ # return boolean indicating whether all children are present
54
+ def completed_child_data
29
55
  @child_works = []
30
56
  found_all_children = true
31
57
 
32
58
  # find and sequence all pending children
33
- @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')
59
+ @pending_children = IiifPrint::PendingRelationship.where(parent_id: @parent_id).order('child_order asc')
34
60
 
35
61
  # find child works (skip out if any haven't yet been created)
36
62
  @pending_children.each do |child|
37
63
  # find by title... if any aren't found, the child works are not yet ready
38
- found_children = find_children_by_title_for(child.child_title, child_model)
64
+ found_children = find_children_by_title_for(child.child_title, @child_model)
39
65
  found_all_children = false if found_children.empty?
40
66
  break unless found_all_children == true
41
67
  @child_works += found_children
@@ -49,30 +75,53 @@ module IiifPrint
49
75
  model.constantize.where(title: title)
50
76
  end
51
77
 
52
- def reschedule(user:, parent_id:, parent_model:, child_model:)
78
+ def add_children_to_parent
79
+ parent_work = @parent_model.constantize.find(@parent_id)
80
+ create_relationships(parent: parent_work, ordered_children: @child_works)
81
+ end
82
+
83
+ def reschedule_job
84
+ return if @retries > RETRY_MAX
53
85
  CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
54
- user: user,
55
- parent_id: parent_id,
56
- parent_model: parent_model,
57
- child_model: child_model
86
+ parent_id: @parent_id,
87
+ parent_model: @parent_model,
88
+ child_model: @child_model,
89
+ retries: @retries
58
90
  )
59
91
  end
60
92
 
61
- def create_relationships(user:, parent:, ordered_children:)
62
- records_hash = {}
63
- ordered_children.map(&:id).each_with_index do |child_id, i|
64
- records_hash[i.to_s] = { id: child_id }
93
+ def create_relationships(parent:, ordered_children:)
94
+ acquire_lock_for(parent.id) do
95
+ # Not sure uncached is needed here, but kept
96
+ # for consistency with Bulkrax's relationships logic
97
+ ActiveRecord::Base.uncached do
98
+ ordered_children.each do |child|
99
+ add_to_work(child_record: child, parent_record: parent)
100
+ @number_of_successes += 1
101
+ rescue => e
102
+ @number_of_failures += 1
103
+ @errors << e
104
+ end
105
+ end
106
+ parent.save! if @parent_record_members_added && @number_of_failures.zero?
65
107
  end
66
- attrs = { work_members_attributes: records_hash }
67
- parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)
68
- env = Hyrax::Actors::Environment.new(parent, Ability.new(user), attrs)
69
108
 
70
- Hyrax::CurationConcern.actor.update(env)
71
- # need to reindex all file_sets to make all ancestors are indexed
109
+ # Bulkrax no longer reindexes file_sets, but IiifPrint needs both to add is_page_of_ssim for universal viewer.
110
+ # This is where child works need to be indexed (AFTER the parent save), as opposed to how Bulkrax does it.
72
111
  ordered_children.each do |child_work|
112
+ child_work.update_index
73
113
  child_work.file_sets.each(&:update_index) if child_work.respond_to?(:file_sets)
74
114
  end
75
115
  end
116
+
117
+ def add_to_work(child_record:, parent_record:)
118
+ return true if parent_record.ordered_members.to_a.include?(child_record)
119
+
120
+ parent_record.ordered_members << child_record
121
+ @parent_record_members_added = true
122
+ # Bulkrax does child_record.save! here, but it makes no sense
123
+ # as there is nothing to save or index at this point.
124
+ end
76
125
  end
77
126
  end
78
127
  end
@@ -0,0 +1,31 @@
1
+ module IiifPrint
2
+ module Jobs
3
+ ##
4
+ # Encapsulates logic for requesting a PDF split: cleans up any child works previously spawned from the file set, then submits the PDF for splitting
5
+ class RequestSplitPdfJob < IiifPrint::Jobs::ApplicationJob
6
+ ##
7
+ # @param file_set [FileSet]
8
+ # @param user [User]
9
+ # rubocop:disable Metrics/MethodLength
10
+ def perform(file_set:, user:)
11
+ return true unless file_set.pdf?
12
+
13
+ work = IiifPrint.parent_for(file_set)
14
+
15
+ # Woe is ye who changes the configuration of the model, thus removing the splitting.
16
+ raise WorkNotConfiguredToSplitFileSetError.new(work: work, file_set: file_set) unless work&.iiif_print_config&.pdf_splitter_job&.presence
17
+
18
+ # clean up any existing spawned child works of this file_set
19
+ IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
20
+ file_set: file_set,
21
+ work: work
22
+ )
23
+
24
+ location = Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id)
25
+
26
+ IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: [location], user: user)
27
+ end
28
+ # rubocop:enable Metrics/MethodLength
29
+ end
30
+ end
31
+ end
@@ -2,7 +2,7 @@ module IiifPrint
2
2
  # The purpose of this module is to encode lineage related services:
3
3
  #
4
4
  # - {.ancestor_ids_for}
5
- # - {.descendent_file_set_ids_for}
5
+ # - {.descendent_member_ids_for}
6
6
  #
7
7
  # The ancestor and descendent_file_sets are useful for ensuring we index together related items.
8
8
  # For example, when I have a work that is a book, and one file set per page of that book, when I
@@ -18,24 +18,45 @@ module IiifPrint
18
18
  def self.ancestor_ids_for(object)
19
19
  ancestor_ids ||= []
20
20
  object.in_works.each do |work|
21
- ancestor_ids << work.id
21
+ ancestor_ids << ancestry_identifier_for(work)
22
22
  ancestor_ids += ancestor_ids_for(work) if work.is_child
23
23
  end
24
24
  ancestor_ids.flatten.compact.uniq
25
25
  end
26
26
 
27
+ ##
28
+ # @api public
29
+ #
30
+ # Given the :work return its identifier
31
+ #
32
+ # @param [Object]
33
+ # @return [String]
34
+ def self.ancestry_identifier_for(work)
35
+ IiifPrint.config.ancestory_identifier_function.call(work)
36
+ end
37
+
27
38
  ##
28
39
  # @param object [#ordered_works, #file_sets, #member_ids]
29
- # @return [Array<String>] the ids of associated file sets
30
- def self.descendent_file_set_ids_for(object)
40
+ # @return [Array<String>] the ids of associated file sets and child works
41
+ #
42
+ # @see
43
+ # https://github.com/samvera/hyrax/blob/2b807fe101176d594129ef8a8fe466d3d03a372b/app/indexers/hyrax/work_indexer.rb#L15-L18
44
+ # for "clarification" of the comingling of file_set_ids and member_ids
45
+ def self.descendent_member_ids_for(object)
31
46
  # enables us to return parents when searching for child OCR
32
- file_set_ids = object.file_sets.map(&:id)
47
+ #
48
+ # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/models/concerns/work_behavior.rb#L90-L92
49
+ #
50
+ # The Hydra::Works implementation of file_set_ids is "members.select(&:file_set?).map(&:id)";
51
+ # so no sense doing `object.file_set_ids + object.member_ids`
52
+ file_set_ids = object.member_ids
33
53
  object.ordered_works&.each do |child|
34
- file_set_ids += descendent_file_set_ids_for(child)
54
+ file_set_ids += descendent_member_ids_for(child)
35
55
  end
36
- # enables us to return parents when searching for child metadata
37
- file_set_ids += object.member_ids
38
56
  file_set_ids.flatten.uniq.compact
39
57
  end
58
+ class << self
59
+ alias descendent_file_set_ids_for descendent_member_ids_for
60
+ end
40
61
  end
41
62
  end
@@ -17,48 +17,43 @@ module IiifPrint
17
17
  @base_url = base_url
18
18
  end
19
19
 
20
- attr_reader :work, :version, :fields
20
+ attr_reader :work, :version, :fields, :current_ability
21
21
 
22
22
  def build_metadata
23
- send("build_metadata_for_v#{version}")
24
- end
25
-
26
- private
27
-
28
- def build_metadata_for_v2
29
23
  fields.map do |field|
30
- label = Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label
31
- if field.name == :collection && member_of_collection?
32
- viewable_collections = Hyrax::CollectionMemberService.run(work, @current_ability)
33
- next if viewable_collections.empty?
34
- { 'label' => label,
35
- 'value' => make_collection_link(viewable_collections) }
36
- else
37
- next if field_is_empty?(field)
38
- { 'label' => label,
39
- 'value' => cast_to_value(field_name: field.name, options: field.options) }
24
+ values = values_for(field_name: field)
25
+ if field.name == :collection && member_of_collection? && viewable_collections.present?
26
+ { 'label' => metadata_map(field, :label),
27
+ 'value' => metadata_map(field, :collection) }
28
+ elsif values.present? && !empty_string?(values)
29
+ { 'label' => metadata_map(field, :label),
30
+ 'value' => metadata_map(field, :value) }
40
31
  end
41
32
  end.compact
42
33
  end
43
34
 
44
- def build_metadata_for_v3
45
- fields.map do |field|
46
- values = Array(work.try(field.name)).map { |value| scrub(value.to_s) }
47
- next if values.empty?
48
- {
49
- 'label' => {
50
- # Since we're using I18n to translate the field, we're setting the locale used in the translation.
51
- I18n.locale.to_s => [Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label]
52
- },
53
- 'value' => {
54
- 'none' => values
55
- }
56
- }
57
- end.compact
35
+ private
36
+
37
+ def metadata_map(field, property)
38
+ if version == 2
39
+ case property
40
+ when :label then field.label
41
+ when :value then cast_to_value(field_name: field.name, options: field.options)
42
+ when :collection then make_collection_link(viewable_collections)
43
+ end
44
+ elsif version == 3
45
+ case property
46
+ when :label then { I18n.locale.to_s => [field.label] }
47
+ when :value then { 'none' => cast_to_value(field_name: field.name, options: field.options) }
48
+ when :collection then { 'none' => make_collection_link(viewable_collections) }
49
+ end
50
+ end
58
51
  end
59
52
 
60
- def field_is_empty?(field)
61
- Array(work.try(field.name)).empty?
53
+ # Bulkrax imports values as [""] if there isn't a value but still a header,
54
+ # these fields should not show in the metadata pane
55
+ def empty_string?(values)
56
+ values.uniq.size == 1 ? values.first == "" : false
62
57
  end
63
58
 
64
59
  def member_of_collection?
@@ -71,21 +66,41 @@ module IiifPrint
71
66
 
72
67
  def cast_to_value(field_name:, options:)
73
68
  if options&.[](:render_as) == :faceted
74
- values_for(field_name: field_name).map do |value|
75
- search_field = field_name.to_s + "_sim"
76
- path = Rails.application.routes.url_helpers.search_catalog_path(
77
- "f[#{search_field}][]": value, locale: I18n.locale
78
- )
79
- path += '&include_child_works=true' if work["is_child_bsi"] == true
80
- "<a href='#{File.join(@base_url, path)}'>#{value}</a>"
81
- end
69
+ faceted_values_for(field_name: field_name)
70
+ elsif qa_field?(field_name: options&.dig(:render_as) || field_name)
71
+ authority_values_for(field_name: field_name)
82
72
  else
83
73
  make_link(values_for(field_name: field_name))
84
74
  end
85
75
  end
86
76
 
77
+ def faceted_values_for(field_name:)
78
+ values_for(field_name: field_name).map do |value|
79
+ search_field = field_name.to_s + "_sim"
80
+ path = Rails.application.routes.url_helpers.search_catalog_path(
81
+ "f[#{search_field}][]": value, locale: I18n.locale
82
+ )
83
+ path += '&include_child_works=true' if work["is_child_bsi"] == true
84
+ "<a href='#{File.join(@base_url, path)}'>#{value}</a>"
85
+ end
86
+ end
87
+
88
+ def qa_field?(field_name:, questioning_authority_fields: IiifPrint.config.questioning_authority_fields)
89
+ questioning_authority_fields.include?(field_name.to_s)
90
+ end
91
+
92
+ def authority_values_for(field_name:)
93
+ authority = Qa::Authorities::Local.subauthority_for(field_name.to_s.pluralize)
94
+ values_for(field_name: field_name).map do |value|
95
+ id, term = authority.find(value).values_at('id', 'term')
96
+ "<a href='#{id}'>#{term}</a>"
97
+ end
98
+ end
99
+
87
100
  def values_for(field_name:)
88
- Array(work.send(field_name))
101
+ field_name = field_name.try(:name) || field_name
102
+ # TODO: we are assuming tesim or dtsi (for dates), might want to account for other suffixes in the future
103
+ Array(work["#{field_name}_tesim"] || work["#{field_name}_dtsi"]&.to_date.try(:to_formatted_s, :standard))
89
104
  end
90
105
 
91
106
  def make_collection_link(collection_documents)
@@ -94,11 +109,16 @@ module IiifPrint
94
109
  end
95
110
  end
96
111
 
97
- # @note This method turns link looking strings into links
112
+ def viewable_collections
113
+ Hyrax::CollectionMemberService.run(SolrDocument.find(work.id), current_ability)
114
+ end
115
+
116
+ # @note This method turns link looking strings into links and assumes https if no protocol was given
98
117
  def make_link(texts)
99
118
  texts.map do |t|
100
119
  t.to_s.gsub(MAKE_LINK_REGEX) do |url|
101
- "<a href='#{url}' target='_blank'>#{url}</a>"
120
+ protocol = url.start_with?('www.') ? 'https://' : ''
121
+ "<a href='#{protocol}#{url}' target='_blank'>#{url}</a>"
102
122
  end
103
123
  end
104
124
  end
@@ -106,10 +126,9 @@ module IiifPrint
106
126
  MAKE_LINK_REGEX = %r{
107
127
  \b
108
128
  (
109
- (?: [a-z][\w-]+:
110
- (?: /{1,3} | [a-z0-9%] ) |
111
- www\d{0,3}[.] |
112
- [a-z0-9.\-]+[.][a-z]{2,4}/
129
+ (?:
130
+ (?:https?://) |
131
+ (?:www\.)
113
132
  )
114
133
  (?:
115
134
  [^\s()<>]+ | \(([^\s()<>]+|(\([^\s()<>]+\)))*\)
@@ -0,0 +1,142 @@
1
+ require 'open3'
2
+ require 'securerandom'
3
+ require 'tmpdir'
4
+ require 'iiif_print/split_pdfs/pdf_image_extraction_service'
5
+
6
+ module IiifPrint
7
+ module SplitPdfs
8
+ # @abstract
9
+ #
10
+ # The purpose of this class is to split the PDF into constituent image files.
11
+ #
12
+ # @see .call
13
+ class BaseSplitter
14
+ ##
15
+ # @api public
16
+ #
17
+ # @param path [String] local path to the PDF that we will split.
18
+ # @return [Enumerable]
19
+ #
20
+ # @see #each
21
+ #
22
+ # @note We're including the ** args to provide method conformity; other services require
23
+ # additional information (such as the FileSet)
24
+ #
25
+ # @see IiifPrint::SplitPdfs::DerivativeRodeoSplitter
26
+ def self.call(path, **)
27
+ new(path).to_a
28
+ end
29
+
30
+ class_attribute :image_extension
31
+ class_attribute :compression, default: nil
32
+ class_attribute :quality, default: nil
33
+
34
+ def initialize(path, tmpdir: Dir.mktmpdir, default_dpi: 400)
35
+ @baseid = SecureRandom.uuid
36
+ @pdfpath = path
37
+ @pdfinfo = IiifPrint::SplitPdfs::PdfImageExtractionService.new(pdfpath)
38
+ @tmpdir = tmpdir
39
+ @default_dpi = default_dpi
40
+ end
41
+
42
+ # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
43
+ include Enumerable
44
+
45
+ # @api public
46
+ #
47
+ # @yieldparam [String] the path to the page's image file (format per image_extension).
48
+ def each
49
+ entries.each do |e|
50
+ yield(e)
51
+ end
52
+ end
53
+
54
+ # @api private
55
+ #
56
+ # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
57
+ def invalid_pdf?
58
+ return true if pdfinfo.color.include?(nil) || pdfinfo.width.nil? || pdfinfo.height.nil? || pdfinfo.page_count.zero?
59
+ false
60
+ end
61
+
62
+ attr_reader :pdfinfo, :tmpdir, :baseid, :default_dpi, :pdfpath
63
+ private :pdfinfo, :tmpdir, :baseid, :default_dpi, :pdfpath
64
+
65
+ private
66
+
67
+ # entries for each page
68
+ def entries
69
+ return @entries if defined? @entries
70
+
71
+ @entries = Array.wrap(gsconvert)
72
+ end
73
+
74
+ # rubocop:disable Metrics/MethodLength
75
+ # ghostscript convert all pages to images (device and extension are supplied by the subclass)
76
+ def gsconvert
77
+ output_base = File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
78
+ # NOTE: you must call gsdevice before compression, as compression is
79
+ # updated during the gsdevice call.
80
+ cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
81
+ cmd += " -sCompression=#{compression}" if compression?
82
+ cmd += " -dJPEGQ=#{quality}" if quality?
83
+ cmd += " -sOutputFile=#{output_base} -r#{ppi} -f #{pdfpath}"
84
+ filenames = []
85
+
86
+ Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
87
+ page_number = 0
88
+ stdout.read.split("\n").each do |line|
89
+ next unless line.start_with?('Page ')
90
+
91
+ page_number += 1
92
+ filenames << File.join(tmpdir, "#{baseid}-page#{page_number}.#{image_extension}")
93
+ end
94
+ end
95
+
96
+ filenames
97
+ end
98
+ # rubocop:enable Metrics/MethodLength
99
+
100
+ def gsdevice
101
+ raise NotImplementedError
102
+ end
103
+
104
+ PAGE_COUNT_REGEXP = %r{^Pages: +(\d+)$}.freeze
105
+
106
+ def pagecount
107
+ return @pagecount if defined? @pagecount
108
+
109
+ cmd = "pdfinfo #{pdfpath}"
110
+ Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
111
+ match = PAGE_COUNT_REGEXP.match(stdout.read)
112
+ @pagecount = match[1].to_i
113
+ end
114
+ @pagecount
115
+ end
116
+
117
+ def ppi
118
+ if looks_scanned?
119
+ # For scanned media, defer to detected image PPI:
120
+ pdfinfo.ppi
121
+ else
122
+ # 400 dpi for something that does not look like scanned media:
123
+ default_dpi
124
+ end
125
+ end
126
+
127
+ def looks_scanned?
128
+ max_image_px = pdfinfo.width * pdfinfo.height
129
+ # single 10mp+ image per page?
130
+ single_image_per_page? && max_image_px > 1024 * 1024 * 10
131
+ end
132
+
133
+ def single_image_per_page?
134
+ pdfinfo.page_count == pagecount
135
+ end
136
+ end
137
+ end
138
+ end
139
+
140
+ require "iiif_print/split_pdfs/pages_to_jpgs_splitter"
141
+ require "iiif_print/split_pdfs/pages_to_pngs_splitter"
142
+ require "iiif_print/split_pdfs/pages_to_tiffs_splitter"