iiif_print 1.1.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (148) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +1 -1
  3. data/Gemfile.lock +2 -2
  4. data/README.md +4 -0
  5. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +1 -1
  6. data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
  7. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +37 -22
  8. data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
  9. data/{lib → app/jobs}/iiif_print/jobs/child_works_from_pdf_job.rb +14 -9
  10. data/{lib → app/jobs}/iiif_print/jobs/create_relationships_job.rb +10 -20
  11. data/app/listeners/iiif_print/listener.rb +31 -0
  12. data/app/models/concerns/iiif_print/set_child_flag.rb +1 -1
  13. data/app/models/concerns/iiif_print/solr/document.rb +5 -3
  14. data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
  15. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  16. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +5 -2
  17. data/app/services/iiif_print/manifest_builder_service_behavior.rb +4 -2
  18. data/app/services/iiif_print/pluggable_derivative_service.rb +5 -1
  19. data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
  20. data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
  21. data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
  22. data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
  23. data/app/views/hyrax/file_sets/_show_actions.html.erb +1 -1
  24. data/config/initializers/simple_schema_loader.rb +1 -0
  25. data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
  26. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
  27. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
  28. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
  29. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +3 -3
  30. data/iiif_print.gemspec +1 -1
  31. data/lib/iiif_print/base_derivative_service.rb +13 -2
  32. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +2 -2
  33. data/lib/iiif_print/catalog_search_builder.rb +2 -2
  34. data/lib/iiif_print/configuration.rb +65 -5
  35. data/lib/iiif_print/data/fileset_helper.rb +2 -2
  36. data/lib/iiif_print/data/work_derivatives.rb +1 -1
  37. data/lib/iiif_print/engine.rb +46 -2
  38. data/lib/iiif_print/homepage_search_builder.rb +2 -2
  39. data/lib/iiif_print/jp2_derivative_service.rb +4 -1
  40. data/lib/iiif_print/lineage_service.rb +19 -6
  41. data/lib/iiif_print/pdf_derivative_service.rb +3 -1
  42. data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
  43. data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
  44. data/lib/iiif_print/persistence_layer.rb +118 -0
  45. data/lib/iiif_print/split_pdfs/base_splitter.rb +11 -0
  46. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +19 -9
  47. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +5 -16
  48. data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
  49. data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
  50. data/lib/iiif_print/tiff_derivative_service.rb +3 -1
  51. data/lib/iiif_print/version.rb +1 -1
  52. data/lib/iiif_print.rb +79 -44
  53. metadata +18 -191
  54. data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -40
  55. data/app/views/hyrax/file_sets/_actions.html.erb +0 -46
  56. data/bin/rails +0 -13
  57. data/spec/.keep.txt +0 -1
  58. data/spec/factories/ability.rb +0 -6
  59. data/spec/factories/newspaper_issue.rb +0 -7
  60. data/spec/factories/newspaper_page.rb +0 -7
  61. data/spec/factories/newspaper_page_solr_document.rb +0 -20
  62. data/spec/factories/newspaper_title.rb +0 -8
  63. data/spec/factories/uploaded_pdf_file.rb +0 -9
  64. data/spec/factories/uploaded_txt_file.rb +0 -9
  65. data/spec/factories/user.rb +0 -13
  66. data/spec/fixtures/authorities/licenses.yml +0 -4
  67. data/spec/fixtures/authorities/rights_statements.yml +0 -4
  68. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  69. data/spec/fixtures/files/4.1.07.tiff +0 -0
  70. data/spec/fixtures/files/README.md +0 -7
  71. data/spec/fixtures/files/alto-2-0.xsd +0 -714
  72. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  73. data/spec/fixtures/files/credits.md +0 -16
  74. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  75. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  76. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  77. data/spec/fixtures/files/minimal-alto.xml +0 -31
  78. data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
  79. data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
  80. data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
  81. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  82. data/spec/fixtures/files/ocr_alto.xml +0 -202
  83. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
  84. data/spec/fixtures/files/ocr_color.tiff +0 -0
  85. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  86. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  87. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  88. data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
  89. data/spec/fixtures/files/page1.tiff +0 -0
  90. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  91. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  92. data/spec/fixtures/files/thumbnail.jpg +0 -0
  93. data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
  94. data/spec/helpers/iiif_print_helper_spec.rb +0 -43
  95. data/spec/iiif_print/base_derivative_service_spec.rb +0 -28
  96. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -59
  97. data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
  98. data/spec/iiif_print/configuration_spec.rb +0 -193
  99. data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
  100. data/spec/iiif_print/data/work_file_spec.rb +0 -99
  101. data/spec/iiif_print/data/work_files_spec.rb +0 -237
  102. data/spec/iiif_print/image_tool_spec.rb +0 -109
  103. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -35
  104. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -118
  105. data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
  106. data/spec/iiif_print/lineage_service_spec.rb +0 -13
  107. data/spec/iiif_print/metadata_spec.rb +0 -249
  108. data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +0 -27
  109. data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +0 -80
  110. data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +0 -92
  111. data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +0 -22
  112. data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +0 -18
  113. data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +0 -19
  114. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
  115. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
  116. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
  117. data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
  118. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
  119. data/spec/iiif_print_spec.rb +0 -171
  120. data/spec/misc_shared.rb +0 -111
  121. data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
  122. data/spec/models/iiif_print/iiif_search_decorator_spec.rb +0 -27
  123. data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
  124. data/spec/models/solr_document_spec.rb +0 -14
  125. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -70
  126. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
  127. data/spec/samvera/derivatives/configuration_spec.rb +0 -41
  128. data/spec/samvera/derivatives/hyrax_spec.rb +0 -62
  129. data/spec/samvera/derivatives_spec.rb +0 -54
  130. data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +0 -103
  131. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
  132. data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +0 -20
  133. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
  134. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -175
  135. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
  136. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
  137. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
  138. data/spec/spec_helper.rb +0 -181
  139. data/spec/support/controller_level_helpers.rb +0 -28
  140. data/spec/support/iiif_print_models.rb +0 -127
  141. data/spec/test_app_templates/blacklight.yml +0 -9
  142. data/spec/test_app_templates/fedora.yml +0 -15
  143. data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
  144. data/spec/test_app_templates/redis.yml +0 -9
  145. data/spec/test_app_templates/solr/conf/schema.xml +0 -362
  146. data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
  147. data/spec/test_app_templates/solr.yml +0 -7
  148. /data/{lib → app/jobs}/iiif_print/jobs/request_split_pdf_job.rb +0 -0
@@ -0,0 +1,183 @@
1
+ module IiifPrint
2
+ module PersistenceLayer
3
+ class ValkyrieAdapter < AbstractAdapter
4
+ ##
5
+ # @param object [Valkyrie::Resource]
6
+ # @return [Array<Valkyrie::Resource>] the result of the find_parent_work custom query, wrapped in an Array
7
+ def self.object_in_works(object)
8
+ Array.wrap(Hyrax.custom_queries.find_parent_work(resource: object))
9
+ end
10
+
11
+ ##
12
+ # @param object [Valkyrie::Resource]
13
+ # @return [Array<Valkyrie::Resource>] child works followed by child file sets
14
+ def self.object_ordered_works(object)
15
+ child_file_sets = Hyrax.custom_queries.find_child_file_sets(resource: object).to_a
16
+ child_works = Hyrax.custom_queries.find_child_works(resource: object).to_a
17
+ child_works + child_file_sets
18
+ end
19
+
20
+ ##
21
+ # @param work_type [Class<Valkyrie::Resource>]
22
+ # @return [Class] the indexer class ("#{work_type}Indexer") for the given :work_type
23
+ def self.decorate_with_adapter_logic(work_type:)
24
+ work_type.send(:include, Hyrax::Schema(:child_works_from_pdf_splitting)) unless work_type.included_modules.include?(Hyrax::Schema(:child_works_from_pdf_splitting))
25
+ # TODO: Use `Hyrax::ValkyrieIndexer.indexer_class_for` once changes are merged.
26
+ indexer = "#{work_type}Indexer".constantize
27
+ indexer.send(:include, Hyrax::Indexer(:child_works_from_pdf_splitting)) unless indexer.included_modules.include?(Hyrax::Indexer(:child_works_from_pdf_splitting))
28
+ indexer
29
+ end
30
+
31
+ ##
32
+ # @param work_type [Class<Valkyrie::Resource>]
33
+ # @return [Class] the form class ("#{work_type}Form") for the given :work_type
34
+ def self.decorate_form_with_adapter_logic(work_type:)
35
+ form = "#{work_type}Form".constantize
36
+ form.send(:include, Hyrax::FormFields(:child_works_from_pdf_splitting)) unless form.included_modules.include?(Hyrax::FormFields(:child_works_from_pdf_splitting))
37
+ form
38
+ end
39
+
40
+ ##
41
+ # Return the immediate parent of the given :file_set.
42
+ #
43
+ # @param file_set [Hyrax::FileMetadata, FileSet] a FileMetadata is first resolved to its file set
44
+ # @return [#work?, Hydra::PCDM::Work]
45
+ # @return [NilClass] when no parent is found.
46
+ def self.parent_for(file_set)
47
+ file_set = Hyrax.query_service.find_by(id: file_set.file_set_id) if file_set.is_a?(Hyrax::FileMetadata)
48
+ Hyrax.query_service.find_parents(resource: file_set).first
49
+ end
50
+
51
+ ##
52
+ # Return the parent's parent of the given :file_set.
53
+ #
54
+ # @param file_set [Hyrax::FileMetadata, FileSet]
55
+ # @return [#work?, Hydra::PCDM::Work]
56
+ # @return [NilClass] when no parent or no grand parent is found.
57
+ def self.grandparent_for(file_set)
58
+ parent = parent_for(file_set)
59
+ return nil unless parent
60
+ Hyrax.query_service.find_parents(resource: parent).first
61
+ end
62
+
63
+ def self.solr_construct_query(*args)
64
+ Hyrax::SolrQueryBuilderService.construct_query(*args)
65
+ end
66
+
67
+ def self.clean_for_tests!
68
+ # For Fedora backed repositories, we'll want to consider some cleaning mechanism. For
69
+ # database backed repositories, we can rely on the database_cleaner gem.
70
+ raise NotImplementedError
71
+ end
72
+
73
+ def self.solr_query(query, **args)
74
+ Hyrax::SolrService.query(query, **args)
75
+ end
76
+
77
+ def self.solr_name(field_name)
78
+ Hyrax.config.index_field_mapper.solr_name(field_name.to_s)
79
+ end
80
+
81
+ # rubocop:disable Lint/UnusedMethodArgument
82
+ def self.destroy_children_split_from(file_set:, work:, model:, user:)
83
+ # rubocop:enable Lint/UnusedMethodArgument
84
+ # Look up child records by the file set id they were split from (:split_from_pdf_id); each one is deleted, de-indexed and announced via 'object.deleted'. Always returns true.
85
+ Hyrax.query_service.find_inverse_references_by(resource: file_set, property: :split_from_pdf_id, model: model).each do |child|
86
+ Hyrax.persister.delete(resource: child)
87
+ Hyrax.indexing_service.delete(resource: child) # NOTE(review): the rest of this class uses Hyrax.index_adapter; confirm Hyrax.indexing_service exists
88
+ Hyrax.publisher.publish('object.deleted', object: child, user: user)
89
+ end
90
+ true
91
+ end
92
+
93
+ def self.pdf?(file_set)
94
+ file_set.original_file.pdf?
95
+ end
96
+
97
+ ##
98
+ # Add a child record as a member of a parent record (appends to member_ids only; nothing is persisted here)
99
+ #
100
+ # @param child_record [Valkyrie::Resource] the record whose id is added
101
+ # @param parent_record [Valkyrie::Resource] the record whose member_ids receives the child's id
102
+ # @return [TrueClass] also when the child was already a member
103
+ def self.create_relationship_between(child_record:, parent_record:)
104
+ return true if parent_record.member_ids.include?(child_record.id)
105
+ parent_record.member_ids << child_record.id
106
+ true
107
+ end
108
+
109
+ ##
110
+ # find a work by title
111
+ # We should only find one, but there is no guarantee of that
112
+ # @param title [String]
113
+ # @param model [String] name of a Valkyrie::Resource model class (constantized below)
114
+ # @return [Array<Valkyrie::Resource>]
115
+ def self.find_by_title_for(title:, model:)
116
+ work_type = model.constantize
117
+ # TODO: This creates a hard dependency on Bulkrax because that is where this custom query is defined
118
+ # Is this adequate? NOTE(review): other methods here call Hyrax.custom_queries; confirm that #custom_query (singular) exists.
119
+ Array.wrap(Hyrax.query_service.custom_query.find_by_model_and_property_value(model: work_type,
120
+ property: :title,
121
+ value: title))
122
+ end
123
+
124
+ ##
125
+ # find a work or file_set
126
+ #
127
+ # @param id [String]
128
+ def self.find_by(id:)
129
+ Hyrax.query_service.find_by(id: id)
130
+ end
131
+
132
+ ##
133
+ # save and index a work, then publish 'object.membership.updated' on behalf of its depositor
134
+ #
135
+ # @param object [Valkyrie::Resource]
136
+ def self.save(object:)
137
+ Hyrax.persister.save(resource: object)
138
+ Hyrax.index_adapter.save(resource: object)
139
+
140
+ Hyrax.publisher.publish('object.membership.updated', object: object, user: object.depositor)
141
+ end
142
+
143
+ ##
144
+ # reindex an array of works and their file_sets
145
+ #
146
+ # @param objects [Array<Valkyrie::Resource>]
147
+ # @return [TrueClass]
148
+ def self.index_works(objects:)
149
+ objects.each do |work|
150
+ Hyrax.index_adapter.save(resource: work)
151
+ Hyrax.custom_queries.find_child_file_sets(resource: work).each do |file_set|
152
+ Hyrax.index_adapter.save(resource: file_set)
153
+ end
154
+ end
155
+ true
156
+ end
157
+
158
+ ##
159
+ # Performs an extra step to create the Hyrax::Metadata objects
160
+ # for derivatives.
161
+ #
162
+ # @param stream [Object] forwarded, together with +directives+, to Hyrax::ValkyriePersistDerivatives.call
163
+ # @return whatever Hyrax::ValkyriePersistDerivatives.call returns
164
+ def self.copy_derivatives_from_data_store(stream:, directives:)
165
+ Hyrax::ValkyriePersistDerivatives.call(stream, directives)
166
+ end
167
+
168
+ ##
169
+ # Extract text from the derivatives
170
+ #
171
+ # @param file_set [Hyrax::FileSet] a Valkyrie fileset
172
+ # @return [String, nil] content of the 'txt'-typed extracted file; nil when there is none
173
+ def self.extract_text_for(file_set:)
174
+ fm = Hyrax.custom_queries.find_many_file_metadata_by_use(resource: file_set,
175
+ use: Hyrax::FileMetadata::Use.uri_for(use: :extracted_file))
176
+ return if fm.empty?
177
+ text_fm = fm.find { |t| t.mime_type == Marcel::MimeType.for(extension: 'txt') }
178
+ return if text_fm.nil?
179
+ text_fm.content
180
+ end
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,118 @@
1
+ module IiifPrint
2
+ ##
3
+ # The PersistenceLayer module provides the namespace for other adapters:
4
+ #
5
+ # - {IiifPrint::PersistenceLayer::ActiveFedoraAdapter}
6
+ # - {IiifPrint::PersistenceLayer::ValkyrieAdapter}
7
+ #
8
+ # And the defining interface in the {IiifPrint::PersistenceLayer::AbstractAdapter}
9
+ module PersistenceLayer
10
+ # @abstract
11
+ class AbstractAdapter
12
+ ##
13
+ # @param object [Object]
14
+ # @return [Array<Object>]
15
+ def self.object_in_works(object)
16
+ raise NotImplementedError, "#{self}.{__method__}"
17
+ end
18
+
19
+ ##
20
+ # @param object [Object]
21
+ # @return [Array<Object>]
22
+ def self.object_ordered_works(object)
23
+ raise NotImplementedError, "#{self}.{__method__}"
24
+ end
25
+
26
+ ##
27
+ # @param work_type [Class]
28
+ # @return the corresponding indexer for the work_type
29
+ def self.decorate_with_adapter_logic(work_type:)
30
+ raise NotImplementedError, "#{self}.{__method__}"
31
+ end
32
+
33
+ ##
34
+ # @param work_type [Class]
35
+ # @return the corresponding indexer for the work_type
36
+ def self.decorate_form_with_adapter_logic(work_type:)
37
+ raise NotImplementedError, "#{self}.{__method__}"
38
+ end
39
+
40
+ ##
41
+ # @param file_set [Object]
42
+ # @param work [Object]
43
+ # @param model [Class] The class name for which we'll split children.
44
+ def self.destroy_children_split_from(file_set:, work:, model:)
45
+ raise NotImplementedError, "#{self}.{__method__}"
46
+ end
47
+
48
+ ##
49
+ # @abstract
50
+ def self.parent_for(*)
51
+ raise NotImplementedError, "#{self}.{__method__}"
52
+ end
53
+
54
+ ##
55
+ # @abstract
56
+ def self.grandparent_for(*)
57
+ raise NotImplementedError, "#{self}.{__method__}"
58
+ end
59
+
60
+ ##
61
+ # @abstract
62
+ def self.solr_field_query(*)
63
+ raise NotImplementedError, "#{self}.{__method__}"
64
+ end
65
+
66
+ ##
67
+ # @abstract
68
+ def self.clean_for_tests!
69
+ return false unless Rails.env.test?
70
+ yield
71
+ end
72
+
73
+ ##
74
+ # @abstract
75
+ def self.solr_query(*args)
76
+ raise NotImplementedError, "#{self}.{__method__}"
77
+ end
78
+
79
+ ##
80
+ # @abstract
81
+ def self.solr_name(*args)
82
+ raise NotImplementedError, "#{self}.{__method__}"
83
+ end
84
+
85
+ def self.pdf?(_file_set)
86
+ raise NotImplementedError, "#{self}.{__method__}"
87
+ end
88
+
89
+ def self.create_relationship_between(child_record:, parent_record:)
90
+ raise NotImplementedError, "#{self}.{__method__}"
91
+ end
92
+
93
+ def self.find_by_title_for(title:, model:)
94
+ raise NotImplementedError, "#{self}.{__method__}"
95
+ end
96
+
97
+ def self.find_by(id:)
98
+ raise NotImplementedError, "#{self}.{__method__}"
99
+ end
100
+
101
+ def self.save(object:)
102
+ raise NotImplementedError, "#{self}.{__method__}"
103
+ end
104
+
105
+ def self.index_works(objects:)
106
+ raise NotImplementedError, "#{self}.{__method__}"
107
+ end
108
+
109
+ def self.copy_derivatives_from_data_store(stream:, directives:)
110
+ raise NotImplementedError, "#{self}.{__method__}"
111
+ end
112
+
113
+ def self.extract_text_for(file_set:)
114
+ raise NotImplementedError, "#{self}.{__method__}"
115
+ end
116
+ end
117
+ end
118
+ end
@@ -27,6 +27,17 @@ module IiifPrint
27
27
  new(path).to_a
28
28
  end
29
29
 
30
+ ##
31
+ # @api public
32
+ #
33
+ # Added to allow for fine-tuning of splitting decision such as tenant-based omission
34
+ # @see https://github.com/samvera/hyku/blob/main/app/services/iiif_print/tenant_config.rb
35
+ #
36
+ # @return [Boolean] returns false to not limit the splitting of PDFs
37
+ def self.never_split_pdfs?
38
+ false
39
+ end
40
+
30
41
  class_attribute :image_extension
31
42
  class_attribute :compression, default: nil
32
43
  class_attribute :quality, default: nil
@@ -29,9 +29,10 @@ module IiifPrint
29
29
  return :no_pdfs_to_split_for_import_url if import_url && !pdfs?(paths: [import_url])
30
30
 
31
31
  file_locations = if import_url
32
+ # TODO: Fix this logic, currently unsupported in Bulkrax
32
33
  [Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
33
34
  else
34
- pdf_paths(files: [file.try(:id)&.to_s].compact)
35
+ pdf_paths(file: file)
35
36
  end
36
37
  return :no_pdfs_to_split if file_locations.empty?
37
38
 
@@ -57,15 +58,21 @@ module IiifPrint
57
58
  # Load an array of paths to pdf files
58
59
  # @param [Array > Hyrax::Upload file ids]
59
60
  # @return [Array > String] file paths to temp directory
60
- def self.pdf_paths(files:)
61
- return [] if files.all?(&:empty?) # assumes an array
61
+ def self.pdf_paths(file:)
62
+ return [] unless file
62
63
 
63
- upload_ids = filter_file_ids(files)
64
- return [] if upload_ids.empty?
64
+ if file.class < Valkyrie::Resource
65
+ # assuming that if one PDF is uploaded to a Valkyrie resource then all of them should be
66
+ paths = [Hyrax.storage_adapter.file_path(file.file_identifier)]
67
+ pdfs_only_for(paths)
68
+ else
69
+ upload_ids = filter_file_ids(file.id.to_s)
70
+ return [] if upload_ids.empty?
65
71
 
66
- uploads = Hyrax::UploadedFile.find(upload_ids)
67
- paths = uploads.map(&method(:upload_path))
68
- pdfs_only_for(paths)
72
+ uploads = Hyrax::UploadedFile.find(upload_ids)
73
+ paths = uploads.map(&method(:upload_path))
74
+ pdfs_only_for(paths)
75
+ end
69
76
  end
70
77
 
71
78
  ##
@@ -75,8 +82,11 @@ module IiifPrint
75
82
  # @param [GenericWork, etc] A valid type of hyrax work
76
83
  # @return [Boolean]
77
84
  def self.iiif_print_split?(work:)
85
+ config = work.try(:iiif_print_config)
86
+ return false unless config
87
+ return false if config.pdf_splitter_service.try(:never_split_pdfs?)
78
88
  # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
79
- return true if work.try(:iiif_print_config)&.pdf_split_child_model
89
+ return true if config&.pdf_split_child_model
80
90
  false
81
91
  end
82
92
 
@@ -7,26 +7,15 @@ module IiifPrint
7
7
  ## @api public
8
8
  # @param file_set [FileSet] What is the containing file set for the provided file.
9
9
  # @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
10
- def self.conditionally_destroy_spawned_children_of(file_set:, work:)
10
+ def self.conditionally_destroy_spawned_children_of(file_set:, work:, user: nil)
11
11
  child_model = work.try(:iiif_print_config)&.pdf_split_child_model
12
12
  return unless child_model
13
- return unless file_set.class.pdf_mime_types.include?(file_set.mime_type)
13
+ return unless IiifPrint.pdf?(file_set)
14
14
 
15
+ # NOTE: The IiifPrint::PendingRelationship is an ActiveRecord object; hence we don't need to
16
+ # leverage an adapter.
15
17
  IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id).find_each(&:destroy)
16
- destroy_spawned_children(model: child_model, file_set: file_set, work: work)
17
- end
18
-
19
- private_class_method def self.destroy_spawned_children(model:, file_set:, work:)
20
- # look first for children by the file set id they were split from
21
- children = model.where(split_from_pdf_id: file_set.id)
22
- if children.blank?
23
- # find works where file name and work `to_param` are both in the title
24
- children = model.where(title: file_set.label).where(title: work.to_param)
25
- end
26
- return if children.blank?
27
- children.each do |rcd|
28
- rcd.destroy(eradicate: true)
29
- end
18
+ IiifPrint.destroy_children_split_from(file_set: file_set, work: work, model: child_model, user: user)
30
19
  end
31
20
  end
32
21
  end
@@ -28,13 +28,15 @@ module IiifPrint
28
28
 
29
29
  ocr_derivatives.each do |extension, method_name|
30
30
  path = prepare_path(extension.to_s)
31
- write(content: ocr.public_send(method_name), path: path)
31
+ write(content: ocr.public_send(method_name), path: path, extension: extension)
32
32
  end
33
33
  end
34
34
 
35
- def write(content:, path:)
35
+ def write(content:, path:, extension:)
36
+ mime_type = mime_type_for(extension)
36
37
  File.open(path, 'w') do |outfile|
37
38
  outfile.write(content)
39
+ IiifPrint.copy_derivatives_from_data_store(stream: content, directives: { url: path, container: 'extracted_text', mime_type: mime_type })
38
40
  end
39
41
  end
40
42
 
@@ -4,9 +4,10 @@ module IiifPrint
4
4
  # NOTE: to keep this from conflicting with TextExtractionDerivativeService,
5
5
  # this class should be invoked by it, not PluggableDerivativeService.
6
6
  class TextFormatsFromALTOService < BaseDerivativeService
7
- self.target_extension = 'tiff'.freeze
7
+ self.target_extension = 'txt'.freeze
8
8
 
9
9
  def save_derivative(destination, data)
10
+ mime_type = mime_type_for(destination)
10
11
  # Load/prepare base of "pairtree" dir structure for extension, fileset
11
12
  prepare_path(destination)
12
13
  #
@@ -17,6 +18,7 @@ module IiifPrint
17
18
  # Write data as UTF-8 encoded text
18
19
  File.open(save_path, "w:UTF-8") do |f|
19
20
  f.write(data)
21
+ IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'extracted_text', mime_type: mime_type })
20
22
  end
21
23
  end
22
24
 
@@ -32,7 +32,9 @@ module IiifPrint
32
32
  source_path += '[0]' if @source_path.ends_with?('pdf')
33
33
  template = use_color? ? COLOR_CMD : GRAY_CMD
34
34
  template = MONO_CMD if one_bit?
35
- format(template, source_file: source_path, out_file: @dest_path)
35
+ data = format(template, source_file: source_path, out_file: @dest_path)
36
+ IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'service_file', mime_type: mime_type_for(target_extension) })
37
+ data
36
38
  end
37
39
 
38
40
  def create_derivatives(filename)
@@ -1,3 +1,3 @@
1
1
  module IiifPrint
2
- VERSION = '1.1.0'.freeze
2
+ VERSION = '2.0.0'.freeze
3
3
  end
data/lib/iiif_print.rb CHANGED
@@ -14,14 +14,14 @@ require "iiif_print/tiff_derivative_service"
14
14
  require "iiif_print/lineage_service"
15
15
  require "iiif_print/metadata"
16
16
  require "iiif_print/works_controller_behavior"
17
- require "iiif_print/jobs/application_job"
18
17
  require "iiif_print/blacklight_iiif_search/annotation_decorator"
19
- require "iiif_print/jobs/child_works_from_pdf_job"
20
- require "iiif_print/jobs/request_split_pdf_job"
21
18
  require "iiif_print/split_pdfs/base_splitter"
22
19
  require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"
23
20
  require "iiif_print/split_pdfs/derivative_rodeo_splitter"
24
21
  require "iiif_print/split_pdfs/destroy_pdf_child_works_service"
22
+ require "iiif_print/persistence_layer"
23
+ require "iiif_print/persistence_layer/active_fedora_adapter"
24
+ require "iiif_print/persistence_layer/valkyrie_adapter"
25
25
 
26
26
  # rubocop:disable Metrics/ModuleLength
27
27
  module IiifPrint
@@ -44,46 +44,45 @@ module IiifPrint
44
44
  end
45
45
 
46
46
  class << self
47
- delegate :skip_splitting_pdf_files_that_end_with_these_texts, to: :config
48
- end
49
-
50
- ##
51
- # Return the immediate parent of the given :file_set.
52
- #
53
- # @param file_set [FileSet]
54
- # @return [#work?, Hydra::PCDM::Work]
55
- # @return [NilClass] when no parent is found.
56
- def self.parent_for(file_set)
57
- # fallback to Fedora-stored relationships if work's aggregation of
58
- # file set is not indexed in Solr
59
- file_set.parent || file_set.member_of.find(&:work?)
60
- end
47
+ delegate(
48
+ :persistence_adapter,
49
+ :skip_splitting_pdf_files_that_end_with_these_texts,
50
+ to: :config
51
+ )
61
52
 
62
- ##
63
- # Return the parent's parent of the given :file_set.
64
- #
65
- # @param file_set [FileSet]
66
- # @return [#work?, Hydra::PCDM::Work]
67
- # @return [NilClass] when no grand parent is found.
68
- def self.grandparent_for(file_set)
69
- parent_of_file_set = parent_for(file_set)
70
- # HACK: This is an assumption about the file_set structure, namely that an image page split from
71
- # a PDF is part of a file set that is a child of a work that is a child of a single work. That
72
- # is, it only has one grand parent. Which is a reasonable assumption for IIIF Print but is not
73
- # valid when extended beyond IIIF Print. That is GenericWork does not have a parent method but
74
- # does have a parents method.
75
- parent_of_file_set.try(:parent_works).try(:first) ||
76
- parent_of_file_set.try(:parents).try(:first) ||
77
- parent_of_file_set&.member_of&.find(&:work?)
53
+ delegate(
54
+ :clean_for_tests!,
55
+ :copy_derivatives_from_data_store,
56
+ :create_relationship_between,
57
+ :destroy_children_split_from,
58
+ :extract_text_for,
59
+ :find_by,
60
+ :find_by_title_for,
61
+ :grandparent_for,
62
+ :index_works,
63
+ :object_in_works,
64
+ :object_ordered_works,
65
+ :parent_for,
66
+ :pdf?,
67
+ :save,
68
+ :solr_construct_query,
69
+ :solr_name,
70
+ :solr_query,
71
+ to: :persistence_adapter
72
+ )
78
73
  end
79
74
 
75
+ # NOTE: We use lambdas so we can have default values but also provide a lazy configuration.
76
+ # There are certainly better ways but this is the least intrusive refactor from prior state.
80
77
  DEFAULT_MODEL_CONFIGURATION = {
81
78
  # Split a PDF into individual page images and create a new child work for each image.
82
- pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob,
83
- pdf_splitter_service: IiifPrint::SplitPdfs::PagesToJpgsSplitter,
84
- derivative_service_plugins: [
85
- IiifPrint::TextExtractionDerivativeService
86
- ]
79
+ pdf_splitter_job: -> { IiifPrint::Jobs::ChildWorksFromPdfJob },
80
+ pdf_splitter_service: -> { IiifPrint::SplitPdfs::PagesToJpgsSplitter },
81
+ derivative_service_plugins: lambda {
82
+ [
83
+ IiifPrint::TextExtractionDerivativeService
84
+ ]
85
+ }
87
86
  }.freeze
88
87
 
89
88
  # This is the record level configuration for PDF split handling.
@@ -127,23 +126,55 @@ module IiifPrint
127
126
  # @see IiifPrint::DEFAULT_MODEL_CONFIGURATION
128
127
  # @todo Because not every job will split PDFs and write to a child model. May want to introduce
129
128
  # an alternative splitting method to create new filesets on the existing work instead of new child works.
129
+ # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
130
130
  def self.model_configuration(**kwargs)
131
131
  Module.new do
132
- def iiif_print_config?
133
- true
132
+ extend ActiveSupport::Concern
133
+
134
+ included do
135
+ work_type = self # In this case self is the class we're mixing the new module into.
136
+
137
+ # Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
138
+ indexer = if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
139
+ IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_with_adapter_logic(work_type: work_type)
140
+ elsif work_type < ActiveFedora::Base
141
+ IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_with_adapter_logic(work_type: work_type)
142
+ else
143
+ raise "Unable to mix '.model_configuration' into #{work_type}"
144
+ end
145
+
146
+ # Ensure that the work_type's corresponding form is properly decorated for IiifPrint
147
+ if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
148
+ IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_form_with_adapter_logic(work_type: work_type)
149
+ elsif work_type < ActiveFedora::Base
150
+ IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_form_with_adapter_logic(work_type: work_type)
151
+ else
152
+ raise "Unable to mix '.model_configuration' into #{work_type}"
153
+ end
154
+
155
+ # Deriving lineage of objects is a potentially complicated thing. We provide a default
156
+ # service but each work_type's indexer can be configured by amending its
157
+ # {.iiif_print_lineage_service}.
158
+ indexer.class_attribute(:iiif_print_lineage_service, default: IiifPrint::LineageService) unless indexer.respond_to?(:iiif_print_lineage_service)
159
+ work_type::GeneratedResourceSchema.send(:include, IiifPrint::SetChildFlag) if work_type.const_defined?(:GeneratedResourceSchema)
134
160
  end
135
161
 
136
162
  # We don't know what you may want in your configuration, but from this gems implementation,
137
163
  # we're going to provide the defaults to ensure that it works.
138
164
  DEFAULT_MODEL_CONFIGURATION.each_pair do |key, default_value|
139
- kwargs[key] ||= default_value
165
+ kwargs[key] ||= default_value.call
140
166
  end
141
167
 
142
168
  define_method(:iiif_print_config) do
143
169
  @iiif_print_config ||= ModelConfig.new(**kwargs)
144
170
  end
171
+
172
+ def iiif_print_config?
173
+ true
174
+ end
145
175
  end
146
176
  end
177
+ # rubocop:enable Metrics/MethodLength
147
178
 
148
179
  # @api public
149
180
  #
@@ -262,11 +293,15 @@ module IiifPrint
262
293
  locations = locations.select { |location| split_for_path_suffix?(location, skip_these_endings: skip_these_endings) }
263
294
  return :no_pdfs_for_splitting if locations.empty?
264
295
 
296
+ # Hyrax::FileSet ids are Valkyrie::IDs, which can't be passed, so we call id on that and get the string id
297
+ file_set_id = file_set.id.try(:id) || file_set.id
298
+ work_admin_set_id = work.admin_set_id.try(:id) || work.admin_set_id
299
+
265
300
  work.try(:iiif_print_config)&.pdf_splitter_job&.perform_later(
266
- file_set,
301
+ file_set_id,
267
302
  locations,
268
303
  user,
269
- work.admin_set_id,
304
+ work_admin_set_id,
270
305
  0 # A no longer used parameter; but we need to preserve the method signature (for now)
271
306
  )
272
307
  end
@@ -288,4 +323,4 @@ module IiifPrint
288
323
  !path.downcase.end_with?(*skip_these_endings.map(&:downcase))
289
324
  end
290
325
  end
291
- # rubocop:enable Metrics/ModuleLength
326
+ # rubocop:enable Metrics/ModuleLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity