iiif_print 1.1.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/Gemfile.lock +2 -2
- data/README.md +4 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +1 -1
- data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +37 -22
- data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
- data/{lib → app/jobs}/iiif_print/jobs/child_works_from_pdf_job.rb +14 -9
- data/{lib → app/jobs}/iiif_print/jobs/create_relationships_job.rb +10 -20
- data/app/listeners/iiif_print/listener.rb +31 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +1 -1
- data/app/models/concerns/iiif_print/solr/document.rb +5 -3
- data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +5 -2
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +4 -2
- data/app/services/iiif_print/pluggable_derivative_service.rb +5 -1
- data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
- data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
- data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
- data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
- data/app/views/hyrax/file_sets/_show_actions.html.erb +1 -1
- data/config/initializers/simple_schema_loader.rb +1 -0
- data/config/metadata/child_works_from_pdf_splitting.yaml +17 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
- data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +3 -3
- data/iiif_print.gemspec +1 -1
- data/lib/iiif_print/base_derivative_service.rb +13 -2
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +2 -2
- data/lib/iiif_print/catalog_search_builder.rb +2 -2
- data/lib/iiif_print/configuration.rb +65 -5
- data/lib/iiif_print/data/fileset_helper.rb +2 -2
- data/lib/iiif_print/data/work_derivatives.rb +1 -1
- data/lib/iiif_print/engine.rb +46 -2
- data/lib/iiif_print/homepage_search_builder.rb +2 -2
- data/lib/iiif_print/jp2_derivative_service.rb +4 -1
- data/lib/iiif_print/lineage_service.rb +19 -6
- data/lib/iiif_print/pdf_derivative_service.rb +3 -1
- data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
- data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
- data/lib/iiif_print/persistence_layer.rb +118 -0
- data/lib/iiif_print/split_pdfs/base_splitter.rb +11 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +19 -9
- data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +5 -16
- data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
- data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
- data/lib/iiif_print/tiff_derivative_service.rb +3 -1
- data/lib/iiif_print/version.rb +1 -1
- data/lib/iiif_print.rb +79 -44
- metadata +19 -192
- data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -40
- data/app/views/hyrax/file_sets/_actions.html.erb +0 -46
- data/bin/rails +0 -13
- data/spec/.keep.txt +0 -1
- data/spec/factories/ability.rb +0 -6
- data/spec/factories/newspaper_issue.rb +0 -7
- data/spec/factories/newspaper_page.rb +0 -7
- data/spec/factories/newspaper_page_solr_document.rb +0 -20
- data/spec/factories/newspaper_title.rb +0 -8
- data/spec/factories/uploaded_pdf_file.rb +0 -9
- data/spec/factories/uploaded_txt_file.rb +0 -9
- data/spec/factories/user.rb +0 -13
- data/spec/fixtures/authorities/licenses.yml +0 -4
- data/spec/fixtures/authorities/rights_statements.yml +0 -4
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +0 -7
- data/spec/fixtures/files/alto-2-0.xsd +0 -714
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +0 -16
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +0 -31
- data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
- data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
- data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +0 -202
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
- data/spec/helpers/iiif_print_helper_spec.rb +0 -43
- data/spec/iiif_print/base_derivative_service_spec.rb +0 -28
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -59
- data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
- data/spec/iiif_print/configuration_spec.rb +0 -193
- data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
- data/spec/iiif_print/data/work_file_spec.rb +0 -99
- data/spec/iiif_print/data/work_files_spec.rb +0 -237
- data/spec/iiif_print/image_tool_spec.rb +0 -109
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -35
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -118
- data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
- data/spec/iiif_print/lineage_service_spec.rb +0 -13
- data/spec/iiif_print/metadata_spec.rb +0 -249
- data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +0 -27
- data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +0 -80
- data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +0 -92
- data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +0 -22
- data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +0 -18
- data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +0 -19
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
- data/spec/iiif_print_spec.rb +0 -171
- data/spec/misc_shared.rb +0 -111
- data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
- data/spec/models/iiif_print/iiif_search_decorator_spec.rb +0 -27
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
- data/spec/models/solr_document_spec.rb +0 -14
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -70
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
- data/spec/samvera/derivatives/configuration_spec.rb +0 -41
- data/spec/samvera/derivatives/hyrax_spec.rb +0 -62
- data/spec/samvera/derivatives_spec.rb +0 -54
- data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +0 -103
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
- data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +0 -20
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -175
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
- data/spec/spec_helper.rb +0 -181
- data/spec/support/controller_level_helpers.rb +0 -28
- data/spec/support/iiif_print_models.rb +0 -127
- data/spec/test_app_templates/blacklight.yml +0 -9
- data/spec/test_app_templates/fedora.yml +0 -15
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
- data/spec/test_app_templates/redis.yml +0 -9
- data/spec/test_app_templates/solr/conf/schema.xml +0 -362
- data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
- data/spec/test_app_templates/solr.yml +0 -7
- /data/{lib → app/jobs}/iiif_print/jobs/request_split_pdf_job.rb +0 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
module IiifPrint
  module PersistenceLayer
    ##
    # Persistence adapter used when works and file sets are Valkyrie resources.
    #
    # Every method is a class method and is implemented in terms of the Hyrax
    # Valkyrie entry points (`Hyrax.query_service`, `Hyrax.custom_queries`,
    # `Hyrax.persister`, `Hyrax.index_adapter` and `Hyrax.publisher`).
    class ValkyrieAdapter < AbstractAdapter
      ##
      # Find the parent work(s) of the given object.
      #
      # @param object [Valkyrie::Resource]
      # @return [Array<Valkyrie::Resource>] the result of the `find_parent_work` custom query,
      #   wrapped in an Array
      def self.object_in_works(object)
        Array.wrap(Hyrax.custom_queries.find_parent_work(resource: object))
      end

      ##
      # Collect the child works and child file sets of the given object.
      #
      # @param object [Valkyrie::Resource]
      # @return [Array<Valkyrie::Resource>] all child works followed by all child file sets
      def self.object_ordered_works(object)
        child_file_sets = Hyrax.custom_queries.find_child_file_sets(resource: object).to_a
        child_works = Hyrax.custom_queries.find_child_works(resource: object).to_a
        child_works + child_file_sets
      end

      ##
      # Mix the `:child_works_from_pdf_splitting` schema into the work type and the matching
      # indexer module into the work type's indexer (each only once).
      #
      # @param work_type [Class<Valkyrie::Resource>]
      # @return the indexer for the given :work_type
      def self.decorate_with_adapter_logic(work_type:)
        work_type.send(:include, Hyrax::Schema(:child_works_from_pdf_splitting)) unless work_type.included_modules.include?(Hyrax::Schema(:child_works_from_pdf_splitting))
        # TODO: Use `Hyrax::ValkyrieIndexer.indexer_class_for` once changes are merged.
        # The indexer is located by naming convention: "<WorkType>Indexer".
        indexer = "#{work_type}Indexer".constantize
        indexer.send(:include, Hyrax::Indexer(:child_works_from_pdf_splitting)) unless indexer.included_modules.include?(Hyrax::Indexer(:child_works_from_pdf_splitting))
        indexer
      end

      ##
      # Mix the `:child_works_from_pdf_splitting` form fields into the work type's form
      # (only once). The form is located by naming convention: "<WorkType>Form".
      #
      # @param work_type [Class<Valkyrie::Resource>]
      # @return form for the given :work_type
      def self.decorate_form_with_adapter_logic(work_type:)
        form = "#{work_type}Form".constantize
        form.send(:include, Hyrax::FormFields(:child_works_from_pdf_splitting)) unless form.included_modules.include?(Hyrax::FormFields(:child_works_from_pdf_splitting))
        form
      end

      ##
      # Return the immediate parent of the given :file_set.
      #
      # @param file_set [Hyrax::FileMetadata, FileSet] when given a Hyrax::FileMetadata, the
      #   owning file set is looked up first via its `file_set_id`
      # @return [#work?, Hydra::PCDM::Work]
      # @return [NilClass] when no parent is found.
      def self.parent_for(file_set)
        file_set = Hyrax.query_service.find_by(id: file_set.file_set_id) if file_set.is_a?(Hyrax::FileMetadata)
        Hyrax.query_service.find_parents(resource: file_set).first
      end

      ##
      # Return the parent's parent of the given :file_set.
      #
      # @param file_set [Hyrax::FileMetadata, FileSet]
      # @return [#work?, Hydra::PCDM::Work]
      # @return [NilClass] when no grand parent is found.
      def self.grandparent_for(file_set)
        parent = parent_for(file_set)
        return nil unless parent
        Hyrax.query_service.find_parents(resource: parent).first
      end

      ##
      # Forward all arguments to Hyrax::SolrQueryBuilderService.construct_query.
      def self.solr_construct_query(*args)
        Hyrax::SolrQueryBuilderService.construct_query(*args)
      end

      ##
      # Not implemented for this adapter.
      #
      # @raise [NotImplementedError] always
      def self.clean_for_tests!
        # For Fedora backed repositories, we'll want to consider some cleaning mechanism.  For
        # database backed repositories, we can rely on the database_cleaner gem.
        raise NotImplementedError
      end

      ##
      # Forward the query and options to Hyrax::SolrService.query.
      #
      # @param query [String]
      def self.solr_query(query, **args)
        Hyrax::SolrService.query(query, **args)
      end

      ##
      # @param field_name [#to_s]
      # @return the Solr name produced by Hyrax's configured index field mapper
      def self.solr_name(field_name)
        Hyrax.config.index_field_mapper.solr_name(field_name.to_s)
      end

      ##
      # Delete every child record whose `split_from_pdf_id` references the given file set,
      # remove it from the index, and publish an 'object.deleted' event for it.
      #
      # @param file_set [Valkyrie::Resource] the file set the children were split from
      # @param work [Object] unused by this adapter; part of the shared adapter signature
      # @param model [Class] the child model to look for
      # @param user [Object] passed along as the :user of the 'object.deleted' event
      # @return [TrueClass]
      # rubocop:disable Lint/UnusedMethodArgument
      def self.destroy_children_split_from(file_set:, work:, model:, user:)
        # rubocop:enable Lint/UnusedMethodArgument
        # look for child records by the file set id they were split from
        Hyrax.query_service.find_inverse_references_by(resource: file_set, property: :split_from_pdf_id, model: model).each do |child|
          Hyrax.persister.delete(resource: child)
          # Use the same index adapter as .save and .index_works.
          Hyrax.index_adapter.delete(resource: child)
          Hyrax.publisher.publish('object.deleted', object: child, user: user)
        end
        true
      end

      ##
      # @param file_set [#original_file]
      # @return [Boolean, NilClass] nil when the file set has no original file
      def self.pdf?(file_set)
        file_set.original_file&.pdf?
      end

      ##
      # Add a child record as a member of a parent record
      #
      # Only the in-memory `member_ids` of the parent is changed here; nothing is persisted
      # by this method.
      #
      # @param child_record [Valkyrie::Resource] a Valkyrie::Resource model
      # @param parent_record [Valkyrie::Resource] a Valkyrie::Resource model
      # @return [TrueClass]
      def self.create_relationship_between(child_record:, parent_record:)
        return true if parent_record.member_ids.include?(child_record.id)
        parent_record.member_ids << child_record.id
        true
      end

      ##
      # find a work by title
      # We should only find one, but there is no guarantee of that
      # @param title [String]
      # @param model [String] the name of a Valkyrie::Resource model
      # @return [Array<Valkyrie::Resource>]
      def self.find_by_title_for(title:, model:)
        work_type = model.constantize
        # TODO: This creates a hard dependency on Bulkrax because that is where this custom query is defined
        #       Is this adequate?
        Array.wrap(Hyrax.custom_queries.find_by_model_and_property_value(model: work_type,
                                                                         property: :title,
                                                                         value: title))
      end

      ##
      # find a work or file_set
      #
      # @param id [String]
      def self.find_by(id:)
        Hyrax.query_service.find_by(id: id)
      end

      ##
      # save a work
      #
      # Persists the resource, indexes the persisted copy, and publishes an
      # 'object.membership.updated' event for it.
      #
      # @param object [Valkyrie::Resource]
      def self.save(object:)
        # Index and publish what the persister returned rather than the pre-save instance.
        saved = Hyrax.persister.save(resource: object)
        Hyrax.index_adapter.save(resource: saved)

        Hyrax.publisher.publish('object.membership.updated', object: saved, user: saved.depositor)
      end

      ##
      # reindex an array of works and their file_sets
      #
      # @param objects [Array<Valkyrie::Resource>]
      # @return [TrueClass]
      def self.index_works(objects:)
        objects.each do |work|
          Hyrax.index_adapter.save(resource: work)
          Hyrax.custom_queries.find_child_file_sets(resource: work).each do |file_set|
            Hyrax.index_adapter.save(resource: file_set)
          end
        end
        true
      end

      ##
      # Performs an extra step to create the Hyrax::Metadata objects
      # for derivatives.
      #
      # @param stream [Object] forwarded to Hyrax::ValkyriePersistDerivatives.call
      # @param directives [Hash] forwarded to Hyrax::ValkyriePersistDerivatives.call
      # @return the result of Hyrax::ValkyriePersistDerivatives.call
      def self.copy_derivatives_from_data_store(stream:, directives:)
        Hyrax::ValkyriePersistDerivatives.call(stream, directives)
      end

      ##
      # Extract text from the derivatives
      #
      # @param file_set [Hyrax::FileSet] a Valkyrie fileset
      # @return [String] Text from fileset's file
      # @return [NilClass] when the file set has no extracted-file metadata, or none of it
      #   has the plain text mime type
      def self.extract_text_for(file_set:)
        fm = Hyrax.custom_queries.find_many_file_metadata_by_use(resource: file_set,
                                                                 use: Hyrax::FileMetadata::Use.uri_for(use: :extracted_file))
        return if fm.empty?
        text_fm = fm.find { |t| t.mime_type == Marcel::MimeType.for(extension: 'txt') }
        return if text_fm.nil?
        text_fm.content
      end
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
module IiifPrint
  ##
  # The PersistenceLayer module provides the namespace for other adapters:
  #
  # - {IiifPrint::PersistenceLayer::ActiveFedoraAdapter}
  # - {IiifPrint::PersistenceLayer::ValkyrieAdapter}
  #
  # And the defining interface in the {IiifPrint::PersistenceLayer::AbstractAdapter}
  module PersistenceLayer
    ##
    # Defines the interface that concrete persistence adapters implement.
    #
    # Except for {.clean_for_tests!}, every method raises NotImplementedError with a message
    # of the form "<receiving class>.<method name>", so a missing override is reported
    # against the adapter that was actually called.
    #
    # @abstract
    class AbstractAdapter
      ##
      # @param object [Object]
      # @return [Array<Object>]
      def self.object_in_works(object)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @param object [Object]
      # @return [Array<Object>]
      def self.object_ordered_works(object)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @param work_type [Class]
      # @return the corresponding indexer for the work_type
      def self.decorate_with_adapter_logic(work_type:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @param work_type [Class]
      # @return the corresponding form for the work_type
      def self.decorate_form_with_adapter_logic(work_type:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @param file_set [Object]
      # @param work [Object]
      # @param model [Class] The class name for which we'll split children.
      # @param user [Object, NilClass] optional; accepted so callers that pass a user
      #   share one signature across adapters
      def self.destroy_children_split_from(file_set:, work:, model:, user: nil)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.parent_for(*)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.grandparent_for(*)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.solr_field_query(*)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.solr_construct_query(*)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # Yields to the given block, but only when Rails reports the test environment.
      #
      # @return [FalseClass] when not running in the test environment
      # @return the value of the block otherwise
      def self.clean_for_tests!
        return false unless Rails.env.test?
        yield
      end

      ##
      # @abstract
      def self.solr_query(*args)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.solr_name(*args)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.pdf?(_file_set)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.create_relationship_between(child_record:, parent_record:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.find_by_title_for(title:, model:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.find_by(id:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.save(object:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.index_works(objects:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.copy_derivatives_from_data_store(stream:, directives:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end

      ##
      # @abstract
      def self.extract_text_for(file_set:)
        raise NotImplementedError, "#{self}.#{__method__}"
      end
    end
  end
end
|
@@ -27,6 +27,17 @@ module IiifPrint
|
|
27
27
|
new(path).to_a
|
28
28
|
end
|
29
29
|
|
30
|
+
##
|
31
|
+
# @api public
|
32
|
+
#
|
33
|
+
# Added to allow for fine-tuning of splitting decision such as tenant-based omission
|
34
|
+
# @see https://github.com/samvera/hyku/blob/main/app/services/iiif_print/tenant_config.rb
|
35
|
+
#
|
36
|
+
# @return [Boolean] returns false to not limit the splitting of PDFs
|
37
|
+
def self.never_split_pdfs?
|
38
|
+
false
|
39
|
+
end
|
40
|
+
|
30
41
|
class_attribute :image_extension
|
31
42
|
class_attribute :compression, default: nil
|
32
43
|
class_attribute :quality, default: nil
|
@@ -29,9 +29,10 @@ module IiifPrint
|
|
29
29
|
return :no_pdfs_to_split_for_import_url if import_url && !pdfs?(paths: [import_url])
|
30
30
|
|
31
31
|
file_locations = if import_url
|
32
|
+
# TODO: Fix this logic, currently unsupported in Bulkrax
|
32
33
|
[Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
|
33
34
|
else
|
34
|
-
pdf_paths(
|
35
|
+
pdf_paths(file: file)
|
35
36
|
end
|
36
37
|
return :no_pdfs_to_split if file_locations.empty?
|
37
38
|
|
@@ -57,15 +58,21 @@ module IiifPrint
|
|
57
58
|
# Load an array of paths to pdf files
|
58
59
|
# @param [Array > Hyrax::Upload file ids]
|
59
60
|
# @return [Array > String] file paths to temp directory
|
60
|
-
def self.pdf_paths(
|
61
|
-
return []
|
61
|
+
def self.pdf_paths(file:)
|
62
|
+
return [] unless file
|
62
63
|
|
63
|
-
|
64
|
-
|
64
|
+
if file.class < Valkyrie::Resource
|
65
|
+
# assuming that if one PDF is uploaded to a Valkyrie resource then all of them should be
|
66
|
+
paths = [Hyrax.storage_adapter.file_path(file.file_identifier)]
|
67
|
+
pdfs_only_for(paths)
|
68
|
+
else
|
69
|
+
upload_ids = filter_file_ids(file.id.to_s)
|
70
|
+
return [] if upload_ids.empty?
|
65
71
|
|
66
|
-
|
67
|
-
|
68
|
-
|
72
|
+
uploads = Hyrax::UploadedFile.find(upload_ids)
|
73
|
+
paths = uploads.map(&method(:upload_path))
|
74
|
+
pdfs_only_for(paths)
|
75
|
+
end
|
69
76
|
end
|
70
77
|
|
71
78
|
##
|
@@ -75,8 +82,11 @@ module IiifPrint
|
|
75
82
|
# @param [GenericWork, etc] A valid type of hyrax work
|
76
83
|
# @return [Boolean]
|
77
84
|
def self.iiif_print_split?(work:)
|
85
|
+
config = work.try(:iiif_print_config)
|
86
|
+
return false unless config
|
87
|
+
return false if config.pdf_splitter_service.try(:never_split_pdfs?)
|
78
88
|
# defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
|
79
|
-
return true if
|
89
|
+
return true if config&.pdf_split_child_model
|
80
90
|
false
|
81
91
|
end
|
82
92
|
|
@@ -7,26 +7,15 @@ module IiifPrint
|
|
7
7
|
## @api public
|
8
8
|
# @param file_set [FileSet] What is the containing file set for the provided file.
|
9
9
|
# @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
|
10
|
-
def self.conditionally_destroy_spawned_children_of(file_set:, work:)
|
10
|
+
def self.conditionally_destroy_spawned_children_of(file_set:, work:, user: nil)
|
11
11
|
child_model = work.try(:iiif_print_config)&.pdf_split_child_model
|
12
12
|
return unless child_model
|
13
|
-
return unless
|
13
|
+
return unless IiifPrint.pdf?(file_set)
|
14
14
|
|
15
|
+
# NOTE: The IiifPrint::PendingRelationship is an ActiveRecord object; hence we don't need to
|
16
|
+
# leverage an adapter.
|
15
17
|
IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id).find_each(&:destroy)
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
private_class_method def self.destroy_spawned_children(model:, file_set:, work:)
|
20
|
-
# look first for children by the file set id they were split from
|
21
|
-
children = model.where(split_from_pdf_id: file_set.id)
|
22
|
-
if children.blank?
|
23
|
-
# find works where file name and work `to_param` are both in the title
|
24
|
-
children = model.where(title: file_set.label).where(title: work.to_param)
|
25
|
-
end
|
26
|
-
return if children.blank?
|
27
|
-
children.each do |rcd|
|
28
|
-
rcd.destroy(eradicate: true)
|
29
|
-
end
|
18
|
+
IiifPrint.destroy_children_split_from(file_set: file_set, work: work, model: child_model, user: user)
|
30
19
|
end
|
31
20
|
end
|
32
21
|
end
|
@@ -28,13 +28,15 @@ module IiifPrint
|
|
28
28
|
|
29
29
|
ocr_derivatives.each do |extension, method_name|
|
30
30
|
path = prepare_path(extension.to_s)
|
31
|
-
write(content: ocr.public_send(method_name), path: path)
|
31
|
+
write(content: ocr.public_send(method_name), path: path, extension: extension)
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
def write(content:, path:)
|
35
|
+
def write(content:, path:, extension:)
|
36
|
+
mime_type = mime_type_for(extension)
|
36
37
|
File.open(path, 'w') do |outfile|
|
37
38
|
outfile.write(content)
|
39
|
+
IiifPrint.copy_derivatives_from_data_store(stream: content, directives: { url: path, container: 'extracted_text', mime_type: mime_type })
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
@@ -4,9 +4,10 @@ module IiifPrint
|
|
4
4
|
# NOTE: to keep this from conflicting with TextExtractionDerivativeService,
|
5
5
|
# this class should be invoked by it, not PluggableDerivativeService.
|
6
6
|
class TextFormatsFromALTOService < BaseDerivativeService
|
7
|
-
self.target_extension = '
|
7
|
+
self.target_extension = 'txt'.freeze
|
8
8
|
|
9
9
|
def save_derivative(destination, data)
|
10
|
+
mime_type = mime_type_for(destination)
|
10
11
|
# Load/prepare base of "pairtree" dir structure for extension, fileset
|
11
12
|
prepare_path(destination)
|
12
13
|
#
|
@@ -17,6 +18,7 @@ module IiifPrint
|
|
17
18
|
# Write data as UTF-8 encoded text
|
18
19
|
File.open(save_path, "w:UTF-8") do |f|
|
19
20
|
f.write(data)
|
21
|
+
IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'extracted_text', mime_type: mime_type })
|
20
22
|
end
|
21
23
|
end
|
22
24
|
|
@@ -32,7 +32,9 @@ module IiifPrint
|
|
32
32
|
source_path += '[0]' if @source_path.ends_with?('pdf')
|
33
33
|
template = use_color? ? COLOR_CMD : GRAY_CMD
|
34
34
|
template = MONO_CMD if one_bit?
|
35
|
-
format(template, source_file: source_path, out_file: @dest_path)
|
35
|
+
data = format(template, source_file: source_path, out_file: @dest_path)
|
36
|
+
IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'service_file', mime_type: mime_type_for(target_extension) })
|
37
|
+
data
|
36
38
|
end
|
37
39
|
|
38
40
|
def create_derivatives(filename)
|
data/lib/iiif_print/version.rb
CHANGED
data/lib/iiif_print.rb
CHANGED
@@ -14,14 +14,14 @@ require "iiif_print/tiff_derivative_service"
|
|
14
14
|
require "iiif_print/lineage_service"
|
15
15
|
require "iiif_print/metadata"
|
16
16
|
require "iiif_print/works_controller_behavior"
|
17
|
-
require "iiif_print/jobs/application_job"
|
18
17
|
require "iiif_print/blacklight_iiif_search/annotation_decorator"
|
19
|
-
require "iiif_print/jobs/child_works_from_pdf_job"
|
20
|
-
require "iiif_print/jobs/request_split_pdf_job"
|
21
18
|
require "iiif_print/split_pdfs/base_splitter"
|
22
19
|
require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"
|
23
20
|
require "iiif_print/split_pdfs/derivative_rodeo_splitter"
|
24
21
|
require "iiif_print/split_pdfs/destroy_pdf_child_works_service"
|
22
|
+
require "iiif_print/persistence_layer"
|
23
|
+
require "iiif_print/persistence_layer/active_fedora_adapter"
|
24
|
+
require "iiif_print/persistence_layer/valkyrie_adapter"
|
25
25
|
|
26
26
|
# rubocop:disable Metrics/ModuleLength
|
27
27
|
module IiifPrint
|
@@ -44,46 +44,45 @@ module IiifPrint
|
|
44
44
|
end
|
45
45
|
|
46
46
|
class << self
|
47
|
-
delegate
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
#
|
53
|
-
# @param file_set [FileSet]
|
54
|
-
# @return [#work?, Hydra::PCDM::Work]
|
55
|
-
# @return [NilClass] when no parent is found.
|
56
|
-
def self.parent_for(file_set)
|
57
|
-
# fallback to Fedora-stored relationships if work's aggregation of
|
58
|
-
# file set is not indexed in Solr
|
59
|
-
file_set.parent || file_set.member_of.find(&:work?)
|
60
|
-
end
|
47
|
+
delegate(
|
48
|
+
:persistence_adapter,
|
49
|
+
:skip_splitting_pdf_files_that_end_with_these_texts,
|
50
|
+
to: :config
|
51
|
+
)
|
61
52
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
53
|
+
delegate(
|
54
|
+
:clean_for_tests!,
|
55
|
+
:copy_derivatives_from_data_store,
|
56
|
+
:create_relationship_between,
|
57
|
+
:destroy_children_split_from,
|
58
|
+
:extract_text_for,
|
59
|
+
:find_by,
|
60
|
+
:find_by_title_for,
|
61
|
+
:grandparent_for,
|
62
|
+
:index_works,
|
63
|
+
:object_in_works,
|
64
|
+
:object_ordered_works,
|
65
|
+
:parent_for,
|
66
|
+
:pdf?,
|
67
|
+
:save,
|
68
|
+
:solr_construct_query,
|
69
|
+
:solr_name,
|
70
|
+
:solr_query,
|
71
|
+
to: :persistence_adapter
|
72
|
+
)
|
78
73
|
end
|
79
74
|
|
75
|
+
# NOTE: We use lambdas so we can have default values but also provide a lazy configuration.
|
76
|
+
# There are certainly better ways but this is the least intrusive refactor from prior state.
|
80
77
|
DEFAULT_MODEL_CONFIGURATION = {
|
81
78
|
# Split a PDF into individual page images and create a new child work for each image.
|
82
|
-
pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob,
|
83
|
-
pdf_splitter_service: IiifPrint::SplitPdfs::PagesToJpgsSplitter,
|
84
|
-
derivative_service_plugins:
|
85
|
-
|
86
|
-
|
79
|
+
pdf_splitter_job: -> { IiifPrint::Jobs::ChildWorksFromPdfJob },
|
80
|
+
pdf_splitter_service: -> { IiifPrint::SplitPdfs::PagesToJpgsSplitter },
|
81
|
+
derivative_service_plugins: lambda {
|
82
|
+
[
|
83
|
+
IiifPrint::TextExtractionDerivativeService
|
84
|
+
]
|
85
|
+
}
|
87
86
|
}.freeze
|
88
87
|
|
89
88
|
# This is the record level configuration for PDF split handling.
|
@@ -127,23 +126,55 @@ module IiifPrint
|
|
127
126
|
# @see IiifPrint::DEFAULT_MODEL_CONFIGURATION
|
128
127
|
# @todo Because not every job will split PDFs and write to a child model. May want to introduce
|
129
128
|
# an alternative splitting method to create new filesets on the existing work instead of new child works.
|
129
|
+
# rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
130
130
|
def self.model_configuration(**kwargs)
|
131
131
|
Module.new do
|
132
|
-
|
133
|
-
|
132
|
+
extend ActiveSupport::Concern
|
133
|
+
|
134
|
+
included do
|
135
|
+
work_type = self # In this case self is the class we're mixing the new module into.
|
136
|
+
|
137
|
+
# Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
|
138
|
+
indexer = if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
|
139
|
+
IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_with_adapter_logic(work_type: work_type)
|
140
|
+
elsif work_type < ActiveFedora::Base
|
141
|
+
IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_with_adapter_logic(work_type: work_type)
|
142
|
+
else
|
143
|
+
raise "Unable to mix '.model_configuration' into #{work_type}"
|
144
|
+
end
|
145
|
+
|
146
|
+
# Ensure that the form corresponding to the work_type is properly decorated for IiifPrint
|
147
|
+
if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
|
148
|
+
IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_form_with_adapter_logic(work_type: work_type)
|
149
|
+
elsif work_type < ActiveFedora::Base
|
150
|
+
IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_form_with_adapter_logic(work_type: work_type)
|
151
|
+
else
|
152
|
+
raise "Unable to mix '.model_configuration' into #{work_type}"
|
153
|
+
end
|
154
|
+
|
155
|
+
# Deriving lineage of objects is a potentially complicated thing. We provide a default
|
156
|
+
# service but each work_type's indexer can be configured by amending its
|
157
|
+
# {.iiif_print_lineage_service}.
|
158
|
+
indexer.class_attribute(:iiif_print_lineage_service, default: IiifPrint::LineageService) unless indexer.respond_to?(:iiif_print_lineage_service)
|
159
|
+
work_type::GeneratedResourceSchema.send(:include, IiifPrint::SetChildFlag) if work_type.const_defined?(:GeneratedResourceSchema)
|
134
160
|
end
|
135
161
|
|
136
162
|
# We don't know what you may want in your configuration, but from this gem's implementation,
|
137
163
|
# we're going to provide the defaults to ensure that it works.
|
138
164
|
DEFAULT_MODEL_CONFIGURATION.each_pair do |key, default_value|
|
139
|
-
kwargs[key] ||= default_value
|
165
|
+
kwargs[key] ||= default_value.call
|
140
166
|
end
|
141
167
|
|
142
168
|
define_method(:iiif_print_config) do
|
143
169
|
@iiif_print_config ||= ModelConfig.new(**kwargs)
|
144
170
|
end
|
171
|
+
|
172
|
+
def iiif_print_config?
|
173
|
+
true
|
174
|
+
end
|
145
175
|
end
|
146
176
|
end
|
177
|
+
# rubocop:enable Metrics/MethodLength
|
147
178
|
|
148
179
|
# @api public
|
149
180
|
#
|
@@ -262,11 +293,15 @@ module IiifPrint
|
|
262
293
|
locations = locations.select { |location| split_for_path_suffix?(location, skip_these_endings: skip_these_endings) }
|
263
294
|
return :no_pdfs_for_splitting if locations.empty?
|
264
295
|
|
296
|
+
# Hyrax::FileSet ids are Valkyrie::ID's which can't be passed, so we call id on that and get the string id
|
297
|
+
file_set_id = file_set.id.try(:id) || file_set.id
|
298
|
+
work_admin_set_id = work.admin_set_id.try(:id) || work.admin_set_id
|
299
|
+
|
265
300
|
work.try(:iiif_print_config)&.pdf_splitter_job&.perform_later(
|
266
|
-
|
301
|
+
file_set_id,
|
267
302
|
locations,
|
268
303
|
user,
|
269
|
-
|
304
|
+
work_admin_set_id,
|
270
305
|
0 # A no longer used parameter; but we need to preserve the method signature (for now)
|
271
306
|
)
|
272
307
|
end
|
@@ -288,4 +323,4 @@ module IiifPrint
|
|
288
323
|
!path.downcase.end_with?(*skip_these_endings.map(&:downcase))
|
289
324
|
end
|
290
325
|
end
|
291
|
-
# rubocop:enable Metrics/ModuleLength
|
326
|
+
# rubocop:enable Metrics/ModuleLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|