curation_concerns-models 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.md +177 -0
- data/README.md +42 -0
- data/Rakefile +1 -0
- data/app/actors/concerns/curation_concerns/manages_embargoes_actor.rb +99 -0
- data/app/actors/curation_concerns/base_actor.rb +90 -0
- data/app/actors/curation_concerns/generic_file_actor.rb +150 -0
- data/app/actors/curation_concerns/work_actor_behavior.rb +88 -0
- data/app/jobs/active_fedora_id_based_job.rb +22 -0
- data/app/jobs/active_fedora_pid_based_job.rb +6 -0
- data/app/jobs/audit_job.rb +58 -0
- data/app/jobs/characterize_job.rb +11 -0
- data/app/jobs/copy_permissions_job.rb +24 -0
- data/app/jobs/create_derivatives_job.rb +15 -0
- data/app/jobs/import_url_job.rb +56 -0
- data/app/jobs/ingest_local_file_job.rb +48 -0
- data/app/jobs/resolrize_job.rb +9 -0
- data/app/models/checksum_audit_log.rb +21 -0
- data/app/models/concerns/curation_concerns/ability.rb +34 -0
- data/app/models/concerns/curation_concerns/basic_metadata.rb +87 -0
- data/app/models/concerns/curation_concerns/collection_behavior.rb +47 -0
- data/app/models/concerns/curation_concerns/generic_file/belongs_to_works.rb +53 -0
- data/app/models/concerns/curation_concerns/generic_file/characterization.rb +89 -0
- data/app/models/concerns/curation_concerns/generic_file/content.rb +8 -0
- data/app/models/concerns/curation_concerns/generic_file/export.rb +343 -0
- data/app/models/concerns/curation_concerns/generic_file/full_text_indexing.rb +12 -0
- data/app/models/concerns/curation_concerns/generic_file/indexing.rb +14 -0
- data/app/models/concerns/curation_concerns/generic_file/versions.rb +16 -0
- data/app/models/concerns/curation_concerns/generic_file.rb +5 -0
- data/app/models/concerns/curation_concerns/generic_file_behavior.rb +44 -0
- data/app/models/concerns/curation_concerns/generic_work_behavior.rb +38 -0
- data/app/models/concerns/curation_concerns/has_representative.rb +14 -0
- data/app/models/concerns/curation_concerns/human_readable_type.rb +23 -0
- data/app/models/concerns/curation_concerns/permissions/readable.rb +19 -0
- data/app/models/concerns/curation_concerns/permissions/writable.rb +75 -0
- data/app/models/concerns/curation_concerns/permissions.rb +7 -0
- data/app/models/concerns/curation_concerns/serializers.rb +15 -0
- data/app/models/concerns/curation_concerns/solr_document_behavior.rb +135 -0
- data/app/models/concerns/curation_concerns/user.rb +65 -0
- data/app/models/concerns/curation_concerns/with_basic_metadata.rb +98 -0
- data/app/models/concerns/curation_concerns/with_generic_files.rb +29 -0
- data/app/models/curation_concerns/classify_concern.rb +47 -0
- data/app/models/curation_concerns/quick_classification_query.rb +31 -0
- data/app/models/datastreams/fits_datastream.rb +148 -0
- data/app/models/version_committer.rb +2 -0
- data/app/services/curation_concerns/characterization_service.rb +71 -0
- data/app/services/curation_concerns/full_text_extraction_service.rb +38 -0
- data/app/services/curation_concerns/generic_file_audit_service.rb +85 -0
- data/app/services/curation_concerns/generic_file_indexing_service.rb +14 -0
- data/app/services/curation_concerns/generic_work_indexing_service.rb +16 -0
- data/app/services/curation_concerns/noid.rb +23 -0
- data/app/services/curation_concerns/repository_audit_service.rb +9 -0
- data/app/services/curation_concerns/versioning_service.rb +27 -0
- data/config/locales/curation_concerns.en.yml +6 -0
- data/curation_concerns-models.gemspec +34 -0
- data/lib/curation_concerns/messages.rb +66 -0
- data/lib/curation_concerns/models/engine.rb +61 -0
- data/lib/curation_concerns/models/resque.rb +36 -0
- data/lib/curation_concerns/models/utils.rb +22 -0
- data/lib/curation_concerns/models/version.rb +5 -0
- data/lib/curation_concerns/models.rb +32 -0
- data/lib/generators/curation_concerns/models/abstract_migration_generator.rb +30 -0
- data/lib/generators/curation_concerns/models/clamav_generator.rb +19 -0
- data/lib/generators/curation_concerns/models/fulltext_generator.rb +28 -0
- data/lib/generators/curation_concerns/models/install_generator.rb +70 -0
- data/lib/generators/curation_concerns/models/templates/app/models/collection.rb +4 -0
- data/lib/generators/curation_concerns/models/templates/app/models/generic_file.rb +4 -0
- data/lib/generators/curation_concerns/models/templates/config/clamav.rb +1 -0
- data/lib/generators/curation_concerns/models/templates/config/curation_concerns.rb +123 -0
- data/lib/generators/curation_concerns/models/templates/config/mime_types.rb +6 -0
- data/lib/generators/curation_concerns/models/templates/config/redis.yml +9 -0
- data/lib/generators/curation_concerns/models/templates/config/redis_config.rb +32 -0
- data/lib/generators/curation_concerns/models/templates/config/resque-pool.yml +1 -0
- data/lib/generators/curation_concerns/models/templates/config/resque_admin.rb +10 -0
- data/lib/generators/curation_concerns/models/templates/config/resque_config.rb +5 -0
- data/lib/generators/curation_concerns/models/templates/migrations/create_checksum_audit_logs.rb +19 -0
- data/lib/generators/curation_concerns/models/templates/migrations/create_version_committers.rb +15 -0
- data/lib/tasks/curation_concerns-models_tasks.rake +75 -0
- data/lib/tasks/migrate.rake +13 -0
- data/lib/tasks/resque.rake +13 -0
- data/lib/tasks/solr_reindex.rake +8 -0
- metadata +282 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
module CurationConcerns
  # Reader and predicate methods for a Solr document: each method looks up a
  # Solr field (named through Solrizer or ::Ability) on the including object
  # via `self[...]` / `get(...)`.
  module SolrDocumentBehavior
    # Mime types recognised by the pdf?/image?/video?/audio? predicates.
    PDF_MIME_TYPES = ['application/pdf'].freeze
    IMAGE_MIME_TYPES = ['image/png', 'image/jpeg', 'image/jpg', 'image/jp2', 'image/bmp', 'image/gif', 'image/tiff'].freeze
    VIDEO_MIME_TYPES = ['video/mpeg', 'video/mp4', 'video/webm', 'video/x-msvideo', 'video/avi', 'video/quicktime', 'application/mxf'].freeze
    # audio/x-wave is the mime type that fits 0.6.0 returns for a wav file.
    # audio/mpeg is the mime type that fits 0.6.0 returns for an mp3 file.
    AUDIO_MIME_TYPES = ['audio/mp3', 'audio/mpeg', 'audio/x-wave', 'audio/x-wav', 'audio/ogg'].freeze

    # @return [String, nil] the title, falling back to the label
    def title_or_label
      title || label
    end

    ##
    # Give our SolrDocument an ActiveModel::Naming appropriate route_key:
    # the part of the has_model value after the last ':', downcased.
    def route_key
      get(Solrizer.solr_name('has_model', :symbol)).split(':').last.downcase
    end

    def to_param
      id
    end

    def to_s
      title_or_label
    end

    ##
    # Offer the source (ActiveFedora-based) model to Rails for some of the
    # Rails methods (e.g. link_to).
    # Falls back to this document when the loaded instance is a bare
    # ActiveFedora::Base. The result is memoized in @model.
    # @example
    #   link_to '...', SolrDocument(:id => 'bXXXXXX5').new => <a href="/dams_object/bXXXXXX5">...</a>
    def to_model
      @model ||= begin
        m = ActiveFedora::Base.load_instance_from_solr(id, self)
        m.class == ActiveFedora::Base ? self : m
      end
    end

    def collection?
      hydra_model == 'Collection'
    end

    # Method to return the ActiveFedora model
    def hydra_model
      self[Solrizer.solr_name('active_fedora_model', Solrizer::Descriptor.new(:string, :stored, :indexed))]
    end

    def human_readable_type
      Array(self[Solrizer.solr_name('human_readable_type', :stored_searchable)]).first
    end

    def representative
      Array(self[Solrizer.solr_name('representative', :stored_searchable)]).first
    end

    # Formats the stored upload date with the :standard date format.
    # @return [String, nil] the formatted date; nil when the field is blank or
    #   cannot be parsed (the failure is logged at info level)
    def date_uploaded
      field = self[Solrizer.solr_name("date_uploaded", :stored_sortable, type: :date)]
      return unless field.present?
      # Tolerate a multi-valued field by using its first value.
      value = field.is_a?(Array) ? field.first : field
      begin
        Date.parse(value).to_formatted_s(:standard)
      rescue ArgumentError, TypeError
        Rails.logger.info "Unable to parse date: #{value.inspect} for #{self['id']}"
        # Return nil explicitly rather than leaking the logger's return value.
        nil
      end
    end

    # @param default [String] value returned when no depositor is stored
    def depositor(default = '')
      val = Array(self[Solrizer.solr_name("depositor")]).first
      val.present? ? val : default
    end

    def title
      Array(self[Solrizer.solr_name('title')]).first
    end

    def description
      Array(self[Solrizer.solr_name('description')]).first
    end

    def label
      Array(self[Solrizer.solr_name('label')]).first
    end

    def file_format
      Array(self[Solrizer.solr_name('file_format')]).first
    end

    def creator
      Array(self[Solrizer.solr_name("creator")]).first
    end

    # @return [Array] every stored tag value (empty when none)
    def tags
      Array(self[Solrizer.solr_name("tag")])
    end

    # @return [Array] every stored resource type value (empty when none)
    def resource_type
      Array(self[Solrizer.solr_name("resource_type")])
    end

    def mime_type
      Array(self[Solrizer.solr_name("mime_type")]).first
    end

    def read_groups
      Array(self[::Ability.read_group_field])
    end

    def edit_groups
      Array(self[::Ability.edit_group_field])
    end

    def edit_people
      Array(self[::Ability.edit_user_field])
    end

    def public?
      read_groups.include?('public')
    end

    def registered?
      read_groups.include?('registered')
    end

    def pdf?
      PDF_MIME_TYPES.include?(mime_type)
    end

    def image?
      IMAGE_MIME_TYPES.include?(mime_type)
    end

    def video?
      VIDEO_MIME_TYPES.include?(mime_type)
    end

    def audio?
      AUDIO_MIME_TYPES.include?(mime_type)
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Behaviour mixed into the application's user model: JSON formatting for
# select2, naming helpers, a memoized ability and a thread-local "current" user.
module CurationConcerns::User
  extend ActiveSupport::Concern

  # Copied piecemeal from the pcdm branch of sufia-models. More may yet be necessary.

  included do
    # Connects this user object to Blacklight's Bookmarks and Folders.
    include Blacklight::User
    include Hydra::User

    delegate :can?, :cannot?, to: :ability

    attr_accessor :update_directory
  end

  # Format the json for select2 which requires just an id and a field called text.
  # If we need an alternate format we should probably look at a json template gem
  # @param opts [Hash, nil] accepted for as_json compatibility; not used
  # @return [Hash] with :id and :text keys
  def as_json(opts = nil)
    { id: user_key, text: display_name ? "#{display_name} (#{user_key})" : user_key }
  end

  # Populate user instance with attributes from remote system (e.g., LDAP)
  # There is no default implementation -- override this in your application
  # def populate_attributes
  # end

  def email_address
    email
  end

  # @return [String] the titleized display name, or user_key when the model
  #   has no display_name or it is blank
  def name
    # Explicit checks replace the former raise/rescue control flow, which let
    # a blank display_name through as an empty name.
    shown = display_name if respond_to?(:display_name)
    shown.present? ? shown.titleize : user_key
  end

  # Redefine this for more intuitive keys in Redis
  def to_param
    # hack because rails doesn't like periods in urls.
    user_key.gsub('.', '-dot-')
  end

  # The basic groups method, override or will fallback to Sufia::Ldap::User
  # def groups
  #   @groups ||= self.group_list ? self.group_list.split(";?;") : []
  # end

  # @return [::Ability] the ability object for this user, memoized
  def ability
    @ability ||= ::Ability.new(self)
  end

  module ClassMethods
    # @return [Object, nil] the user stored for the current thread
    def current
      Thread.current[:user]
    end

    # Stores the given user for the current thread.
    def current=(user)
      Thread.current[:user] = user
    end

    # def from_url_component(component)
    #   User.find_by_user_key(component.gsub(/-dot-/, '.'))
    # end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module CurationConcerns
  # This is a direct copy of Sufia::GenericFile::Metadata with a few modifications:
  # * title & description are single-value instead of multivalue
  module DefaultMetadata
    extend ActiveSupport::Concern

    included do
      # NOTE(review): :label and :title below are both declared with the
      # ::RDF::DC.title predicate -- confirm this sharing is intended.
      property :label, predicate: ::RDF::DC.title, multiple: false

      property :depositor, predicate: ::RDF::URI.new("http://id.loc.gov/vocabulary/relators/dpt"), multiple: false do |idx|
        idx.as :symbol, :stored_searchable
      end

      property :relative_path, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#relativePath'), multiple: false

      property :import_url, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#importUrl'), multiple: false do |idx|
        idx.as :symbol
      end

      property :part_of, predicate: ::RDF::DC.isPartOf

      property :resource_type, predicate: ::RDF::DC.type do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :title, predicate: ::RDF::DC.title, multiple: false do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :creator, predicate: ::RDF::DC.creator do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :contributor, predicate: ::RDF::DC.contributor do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :description, predicate: ::RDF::DC.description, multiple: false do |idx|
        idx.type :text
        idx.as :stored_searchable
      end

      property :tag, predicate: ::RDF::DC.relation do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :rights, predicate: ::RDF::DC.rights do |idx|
        idx.as :stored_searchable
      end

      property :publisher, predicate: ::RDF::DC.publisher do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :date_created, predicate: ::RDF::DC.created do |idx|
        idx.as :stored_searchable
      end

      property :date_uploaded, predicate: ::RDF::DC.dateSubmitted, multiple: false do |idx|
        idx.type :date
        idx.as :stored_sortable
      end

      property :date_modified, predicate: ::RDF::DC.modified, multiple: false do |idx|
        idx.type :date
        idx.as :stored_sortable
      end

      property :subject, predicate: ::RDF::DC.subject do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :language, predicate: ::RDF::DC.language do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :identifier, predicate: ::RDF::DC.identifier do |idx|
        idx.as :stored_searchable
      end

      property :based_near, predicate: ::RDF::FOAF.based_near do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :related_url, predicate: ::RDF::RDFS.seeAlso do |idx|
        idx.as :stored_searchable
      end

      property :bibliographic_citation, predicate: ::RDF::DC.bibliographicCitation do |idx|
        idx.as :stored_searchable
      end

      property :source, predicate: ::RDF::DC.source do |idx|
        idx.as :stored_searchable
      end

      # TODO: Move this somewhere more appropriate
      # Best effort: any error raised while registering the vocabularies is
      # swallowed and reported on stdout.
      begin
        LocalAuthority.register_vocabulary(self, "subject", "lc_subjects")
        LocalAuthority.register_vocabulary(self, "language", "lexvo_languages")
        LocalAuthority.register_vocabulary(self, "tag", "lc_genres")
      rescue
        puts "tables for vocabularies missing"
      end
    end

    # Add a schema.org itemtype
    # @return [String] the schema.org URL configured for the first non-empty
    #   resource type, or the CreativeWork URL when nothing matches or the
    #   lookup raises
    def itemtype
      # Look up the first non-empty resource type value in a hash from the config
      first_type = resource_type.to_a.find { |candidate| !candidate.empty? }
      CurationConcerns.config.resource_types_to_schema[first_type] || 'http://schema.org/CreativeWork'
    rescue
      'http://schema.org/CreativeWork'
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copied from Curate
|
2
|
+
module CurationConcerns
  # Helpers for an object that has a `generic_files` association: id listing,
  # cascading destroy and visibility propagation.
  module WithGenericFiles
    extend ActiveSupport::Concern

    included do
      # The generic_files association and its accessor methods comes from Hydra::Works::AggregatesGenericFiles
      before_destroy :before_destroy_cleanup_generic_files
    end

    # Stopgap until ActiveFedora ContainerAssociation includes an *_ids accessor.
    # At the moment, this is no more efficient than calling generic_files, but hopefully that will change in the future.
    # @return [Array] the id of every attached generic file
    def generic_file_ids
      generic_files.map(&:id)
    end

    # before_destroy callback: destroys every attached generic file.
    def before_destroy_cleanup_generic_files
      generic_files.each(&:destroy)
    end

    # Assigns this object's visibility to each attached generic file and
    # saves it with save!.
    def copy_visibility_to_files
      generic_files.each do |file|
        file.visibility = visibility
        file.save!
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'active_attr'
|
2
|
+
module CurationConcerns
  # Form-style model holding a chosen curation concern type, validated
  # against the registered types from the CurationConcerns configuration.
  class ClassifyConcern
    include ActiveAttr::Model
    attribute :curation_concern_type

    validates(
      :curation_concern_type,
      presence: true,
      inclusion: { in: ->(record) { record.registered_curation_concern_types } }
    )

    # @return [Array<Class>] classes for the registered types, sorted by type name
    def all_curation_concern_classes
      registered_curation_concern_types.sort.map { |type_name| self.class.to_class(type_name) }
    end

    # @return the registered type names read from the configuration
    def registered_curation_concern_types
      CurationConcerns.configuration.registered_curation_concern_types
    end

    # @return [Array<Array>] [human readable type, type name] pairs
    def possible_curation_concern_types
      registered_curation_concern_types.map do |type_name|
        [self.class.to_class(type_name).human_readable_type, type_name]
      end
    end

    # @return [Class] the class for the selected curation_concern_type
    # @raise [RuntimeError] when the selected type is not a registered one
    def curation_concern_class
      known = possible_curation_concern_types.any? do |_label, class_name|
        class_name == curation_concern_type
      end
      raise RuntimeError, "Invalid :curation_concern_type" unless known
      self.class.to_class(curation_concern_type)
    end

    # Resolves a type name to its constant.
    # @param type [String] a type name
    # @return [Class]
    def self.to_class(type)
      # TODO we may want to allow a different (or nil) namespace
      # begin
      #   "::#{type.camelize}".constantize
      # rescue NameError
      #   "CurationConcerns::#{type}".constantize
      # end
      type.camelize.constantize
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module CurationConcerns
  # Finds the registered curation concern classes that a user may create.
  class QuickClassificationQuery
    # Builds a query from the given arguments and yields each creatable class.
    # @param args arguments forwarded to #initialize
    def self.each_for_context(*args, &block)
      new(*args).all.each(&block)
    end

    attr_reader :user

    # @param user [#can?] the user whose permissions are checked
    # @param options [Hash]
    # @option options [#call] :concern_name_normalizer maps a registered name
    #   to a class; defaults to ClassifyConcern.to_class
    # @option options [Array] :registered_curation_concern_names names to
    #   consider; defaults to the configured registered types
    def initialize(user, options = {})
      @user = user
      # Block-form fetch: the defaults are only evaluated when the option is
      # absent, so callers supplying both options never touch ClassifyConcern
      # or the global configuration.
      @concern_name_normalizer = options.fetch(:concern_name_normalizer) { ClassifyConcern.method(:to_class) }
      @registered_curation_concern_names = options.fetch(:registered_curation_concern_names) do
        CurationConcerns.configuration.registered_curation_concern_types
      end
    end

    # @return [Array] the normalized concern classes the user can :create
    def all
      # Normalize once instead of once per log line plus once for the select.
      concern_classes = normalized_curation_concern_names
      log = ActiveFedora::Base.logger
      # Block form defers building the messages (and the extra can? call)
      # until the logger actually emits debug output.
      log.debug { "User is #{user}" }
      log.debug { "try is #{concern_classes.first}" }
      log.debug { "can is #{user.can?(:create, concern_classes.first)}" }
      concern_classes.select { |klass| user.can?(:create, klass) }
    end

    private

      attr_reader :concern_name_normalizer, :registered_curation_concern_names

      # @return [Array] each registered name passed through the normalizer
      def normalized_curation_concern_names
        registered_curation_concern_names.map { |name| concern_name_normalizer.call(name) }
      end
  end
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
# OM datastream describing FITS output: a terminology that maps FITS XML
# elements to terms, plus top-level proxy terms for the nested ones.
class FitsDatastream < ActiveFedora::OmDatastream
  include OM::XML::Document

  set_terminology do |t|
    t.root(path: "fits",
           xmlns: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
           schema: "http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd")

    t.identification do
      t.identity do
        t.format_label(path: { attribute: "format" })
        t.mime_type(path: { attribute: "mimetype" })
      end
    end

    t.fileinfo do
      t.file_size(path: "size")
      t.last_modified(path: "lastmodified")
      t.filename(path: "filename")
      t.original_checksum(path: "md5checksum")
      t.rights_basis(path: "rightsBasis")
      t.copyright_basis(path: "copyrightBasis")
      t.copyright_note(path: "copyrightNote")
    end

    t.filestatus do
      t.well_formed(path: "well-formed")
      t.valid(path: "valid")
      t.status_message(path: "message")
    end

    t.metadata do
      t.document do
        t.file_title(path: "title")
        t.file_author(path: "author")
        t.file_language(path: "language")
        t.page_count(path: "pageCount")
        t.word_count(path: "wordCount")
        t.character_count(path: "characterCount")
        t.paragraph_count(path: "paragraphCount")
        t.line_count(path: "lineCount")
        t.table_count(path: "tableCount")
        t.graphics_count(path: "graphicsCount")
      end

      t.image do
        t.byte_order(path: "byteOrder")
        t.compression(path: "compressionScheme")
        t.width(path: "imageWidth")
        t.height(path: "imageHeight")
        t.color_space(path: "colorSpace")
        t.profile_name(path: "iccProfileName")
        t.profile_version(path: "iccProfileVersion")
        t.orientation(path: "orientation")
        t.color_map(path: "colorMap")
        t.image_producer(path: "imageProducer")
        t.capture_device(path: "captureDevice")
        t.scanning_software(path: "scanningSoftwareName")
        t.exif_version(path: "exifVersion")
        t.gps_timestamp(path: "gpsTimeStamp")
        t.latitude(path: "gpsDestLatitude")
        t.longitude(path: "gpsDestLongitude")
      end

      t.text do
        t.character_set(path: "charset")
        t.markup_basis(path: "markupBasis")
        t.markup_language(path: "markupLanguage")
      end

      t.audio do
        t.duration(path: "duration")
        t.bit_depth(path: "bitDepth")
        t.sample_rate(path: "sampleRate")
        t.channels(path: "channels")
        t.data_format(path: "dataFormatType")
        t.offset(path: "offset")
      end

      t.video do
        t.width(path: "imageWidth")
        t.height(path: "imageHeight")
        t.duration(path: "duration")
        t.sample_rate(path: "sampleRate")
        t.frame_rate(path: "frameRate")
      end
    end

    # Top-level proxies for the nested terms above. The unprefixed width,
    # height, duration and sample_rate point at the image/audio terms; the
    # video equivalents carry a video_ prefix.
    t.format_label(proxy: [:identification, :identity, :format_label])
    t.mime_type(proxy: [:identification, :identity, :mime_type])
    t.file_size(proxy: [:fileinfo, :file_size])
    t.last_modified(proxy: [:fileinfo, :last_modified])
    t.filename(proxy: [:fileinfo, :filename])
    t.original_checksum(proxy: [:fileinfo, :original_checksum])
    t.rights_basis(proxy: [:fileinfo, :rights_basis])
    t.copyright_basis(proxy: [:fileinfo, :copyright_basis])
    t.copyright_note(proxy: [:fileinfo, :copyright_note])
    t.well_formed(proxy: [:filestatus, :well_formed])
    t.valid(proxy: [:filestatus, :valid])
    t.status_message(proxy: [:filestatus, :status_message])
    t.file_title(proxy: [:metadata, :document, :file_title])
    t.file_author(proxy: [:metadata, :document, :file_author])
    t.page_count(proxy: [:metadata, :document, :page_count])
    t.file_language(proxy: [:metadata, :document, :file_language])
    t.word_count(proxy: [:metadata, :document, :word_count])
    t.character_count(proxy: [:metadata, :document, :character_count])
    t.paragraph_count(proxy: [:metadata, :document, :paragraph_count])
    t.line_count(proxy: [:metadata, :document, :line_count])
    t.table_count(proxy: [:metadata, :document, :table_count])
    t.graphics_count(proxy: [:metadata, :document, :graphics_count])
    t.byte_order(proxy: [:metadata, :image, :byte_order])
    t.compression(proxy: [:metadata, :image, :compression])
    t.width(proxy: [:metadata, :image, :width])
    t.video_width(proxy: [:metadata, :video, :width])
    t.height(proxy: [:metadata, :image, :height])
    t.video_height(proxy: [:metadata, :video, :height])
    t.color_space(proxy: [:metadata, :image, :color_space])
    t.profile_name(proxy: [:metadata, :image, :profile_name])
    t.profile_version(proxy: [:metadata, :image, :profile_version])
    t.orientation(proxy: [:metadata, :image, :orientation])
    t.color_map(proxy: [:metadata, :image, :color_map])
    t.image_producer(proxy: [:metadata, :image, :image_producer])
    t.capture_device(proxy: [:metadata, :image, :capture_device])
    t.scanning_software(proxy: [:metadata, :image, :scanning_software])
    t.exif_version(proxy: [:metadata, :image, :exif_version])
    t.gps_timestamp(proxy: [:metadata, :image, :gps_timestamp])
    t.latitude(proxy: [:metadata, :image, :latitude])
    t.longitude(proxy: [:metadata, :image, :longitude])
    t.character_set(proxy: [:metadata, :text, :character_set])
    t.markup_basis(proxy: [:metadata, :text, :markup_basis])
    t.markup_language(proxy: [:metadata, :text, :markup_language])
    t.duration(proxy: [:metadata, :audio, :duration])
    t.video_duration(proxy: [:metadata, :video, :duration])
    t.bit_depth(proxy: [:metadata, :audio, :bit_depth])
    t.sample_rate(proxy: [:metadata, :audio, :sample_rate])
    t.video_sample_rate(proxy: [:metadata, :video, :sample_rate])
    t.channels(proxy: [:metadata, :audio, :channels])
    t.data_format(proxy: [:metadata, :audio, :data_format])
    t.offset(proxy: [:metadata, :audio, :offset])
    t.frame_rate(proxy: [:metadata, :video, :frame_rate])
  end

  # Builds the XML skeleton: a <fits> root carrying the namespace, schema
  # location, version and timestamp attributes, with a single
  # identification/identity child.
  # @return the document produced by the Nokogiri builder
  def self.xml_template
    # Namespace URI and schema URL, separated by a newline.
    schema_location = "http://hul.harvard.edu/ois/xml/ns/fits/fits_output\n" \
                      "http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd"
    root_attributes = {
      xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
      'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
      'xsi:schemaLocation' => schema_location,
      version: "0.6.0",
      timestamp: "1/25/12 11:04 AM"
    }
    template = Nokogiri::XML::Builder.new do |xml|
      xml.fits(root_attributes) do
        xml.identification { xml.identity(toolname: 'FITS') }
      end
    end
    template.doc
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module CurationConcerns
  # Run FITS to gather technical metadata about the content and the full text.
  # Store this extracted metadata in the characterization datastream.
  class CharacterizationService
    include Hydra::Derivatives::ExtractMetadata

    delegate :mime_type, :uri, to: :@generic_file
    attr_reader :generic_file

    # Convenience entry point: builds a service for the file and characterizes it.
    def self.run(generic_file)
      new(generic_file).characterize
    end

    # @param generic_file the file object to characterize
    def initialize(generic_file)
      @generic_file = generic_file
    end

    ## Extract the metadata from the content datastream and record it in the characterization datastream
    # Also stores any extracted full text and sets the file's filename to the
    # original file's original name (wrapped in an array).
    def characterize
      store_metadata(extract_metadata)
      store_fulltext(extract_fulltext)
      generic_file.filename = [generic_file.original_file.original_name]
    end

    protected

      # Builds the extracted-text file and fills it; does nothing when the
      # text is blank.
      def store_fulltext(extracted_text)
        return unless extracted_text.present?
        text_file = generic_file.build_extracted_text
        text_file.content = extracted_text
      end

      def extract_fulltext
        FullTextExtractionService.run(generic_file)
      end

      # Stores the metadata XML when present, then always copies mapped
      # terms onto the file's properties.
      def store_metadata(metadata)
        generic_file.characterization.ng_xml = metadata if metadata.present?
        append_metadata
      end

      # @return the FITS characterization result, or nil when the original
      #   file has no content
      def extract_metadata
        source_file = generic_file.original_file
        return unless source_file.has_content?
        Hydra::FileCharacterization.characterize(source_file.content, filename_for_characterization.join, :fits) do |config|
          config[:fits] = Hydra::Derivatives.fits_path
        end
      end

      # Populate GenericFile's properties with fields from FITS (e.g. Author from pdfs)
      def append_metadata
        terms = generic_file.characterization_terms
        CurationConcerns.config.fits_to_desc_mapping.each_pair do |fits_term, property_name|
          next unless terms.key?(fits_term)
          # coerce to array to remove a conditional
          terms[fits_term] = [terms[fits_term]] unless terms[fits_term].is_a? Array
          terms[fits_term].each do |term_value|
            current = generic_file.send(property_name)
            if current.kind_of?(Array)
              current << term_value unless current.include?(term_value)
            else
              # these are single-valued terms which cannot be appended to
              generic_file.send("#{property_name}=", term_value)
            end
          end
        end
      end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module CurationConcerns
  # Extract the full text from the content using Solr's extract handler
  class FullTextExtractionService
    # Convenience entry point: builds a service for the file and extracts its text.
    def self.run(generic_file)
      new(generic_file).extract
    end

    delegate :original_file, :logger, :mime_type, :id, to: :@generic_file

    # @param generic_file the file object whose original_file content is posted
    def initialize(generic_file)
      @generic_file = generic_file
    end

    # Posts the original file's content to Solr's /update/extract handler
    # (extractOnly mode) and returns the extracted text.
    # @return [String, nil] the extracted text with trailing whitespace
    #   removed, or nil when anything goes wrong (the error is logged)
    def extract
      uri = URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
      http = Net::HTTP.new(uri.host, uri.port)
      # Enable TLS for https Solr URLs; Net::HTTP does not infer it from the port.
      http.use_ssl = (uri.scheme == 'https')
      content = original_file.content
      # Content-Length is a byte count; String#size counts characters.
      content_length = content.respond_to?(:bytesize) ? content.bytesize : content.size
      resp = http.post(uri.request_uri, content, {
        'Content-type' => "#{mime_type};charset=utf-8",
        'Content-Length' => content_length.to_s
      })
      # Rewind before checking the status so the content is reusable even
      # after a failed request.
      content.rewind if content.respond_to?(:rewind)
      raise "URL '#{uri}' returned code #{resp.code}" unless resp.code == "200"
      JSON.parse(resp.body)[''].rstrip
    rescue => e
      logger.error("Error extracting content from #{id}: #{e.inspect}")
      nil
    end

    # Picks the Solr URL from Blacklight's connection config, trying in order:
    # the :url key, the "url" key, the :fulltext section, the :default section.
    def connection_url
      config = Blacklight.connection_config
      config[:url] || config["url"] || (config[:fulltext] || config[:default])["url"]
    end
  end
end
|