curation_concerns-models 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.md +177 -0
  5. data/README.md +42 -0
  6. data/Rakefile +1 -0
  7. data/app/actors/concerns/curation_concerns/manages_embargoes_actor.rb +99 -0
  8. data/app/actors/curation_concerns/base_actor.rb +90 -0
  9. data/app/actors/curation_concerns/generic_file_actor.rb +150 -0
  10. data/app/actors/curation_concerns/work_actor_behavior.rb +88 -0
  11. data/app/jobs/active_fedora_id_based_job.rb +22 -0
  12. data/app/jobs/active_fedora_pid_based_job.rb +6 -0
  13. data/app/jobs/audit_job.rb +58 -0
  14. data/app/jobs/characterize_job.rb +11 -0
  15. data/app/jobs/copy_permissions_job.rb +24 -0
  16. data/app/jobs/create_derivatives_job.rb +15 -0
  17. data/app/jobs/import_url_job.rb +56 -0
  18. data/app/jobs/ingest_local_file_job.rb +48 -0
  19. data/app/jobs/resolrize_job.rb +9 -0
  20. data/app/models/checksum_audit_log.rb +21 -0
  21. data/app/models/concerns/curation_concerns/ability.rb +34 -0
  22. data/app/models/concerns/curation_concerns/basic_metadata.rb +87 -0
  23. data/app/models/concerns/curation_concerns/collection_behavior.rb +47 -0
  24. data/app/models/concerns/curation_concerns/generic_file/belongs_to_works.rb +53 -0
  25. data/app/models/concerns/curation_concerns/generic_file/characterization.rb +89 -0
  26. data/app/models/concerns/curation_concerns/generic_file/content.rb +8 -0
  27. data/app/models/concerns/curation_concerns/generic_file/export.rb +343 -0
  28. data/app/models/concerns/curation_concerns/generic_file/full_text_indexing.rb +12 -0
  29. data/app/models/concerns/curation_concerns/generic_file/indexing.rb +14 -0
  30. data/app/models/concerns/curation_concerns/generic_file/versions.rb +16 -0
  31. data/app/models/concerns/curation_concerns/generic_file.rb +5 -0
  32. data/app/models/concerns/curation_concerns/generic_file_behavior.rb +44 -0
  33. data/app/models/concerns/curation_concerns/generic_work_behavior.rb +38 -0
  34. data/app/models/concerns/curation_concerns/has_representative.rb +14 -0
  35. data/app/models/concerns/curation_concerns/human_readable_type.rb +23 -0
  36. data/app/models/concerns/curation_concerns/permissions/readable.rb +19 -0
  37. data/app/models/concerns/curation_concerns/permissions/writable.rb +75 -0
  38. data/app/models/concerns/curation_concerns/permissions.rb +7 -0
  39. data/app/models/concerns/curation_concerns/serializers.rb +15 -0
  40. data/app/models/concerns/curation_concerns/solr_document_behavior.rb +135 -0
  41. data/app/models/concerns/curation_concerns/user.rb +65 -0
  42. data/app/models/concerns/curation_concerns/with_basic_metadata.rb +98 -0
  43. data/app/models/concerns/curation_concerns/with_generic_files.rb +29 -0
  44. data/app/models/curation_concerns/classify_concern.rb +47 -0
  45. data/app/models/curation_concerns/quick_classification_query.rb +31 -0
  46. data/app/models/datastreams/fits_datastream.rb +148 -0
  47. data/app/models/version_committer.rb +2 -0
  48. data/app/services/curation_concerns/characterization_service.rb +71 -0
  49. data/app/services/curation_concerns/full_text_extraction_service.rb +38 -0
  50. data/app/services/curation_concerns/generic_file_audit_service.rb +85 -0
  51. data/app/services/curation_concerns/generic_file_indexing_service.rb +14 -0
  52. data/app/services/curation_concerns/generic_work_indexing_service.rb +16 -0
  53. data/app/services/curation_concerns/noid.rb +23 -0
  54. data/app/services/curation_concerns/repository_audit_service.rb +9 -0
  55. data/app/services/curation_concerns/versioning_service.rb +27 -0
  56. data/config/locales/curation_concerns.en.yml +6 -0
  57. data/curation_concerns-models.gemspec +34 -0
  58. data/lib/curation_concerns/messages.rb +66 -0
  59. data/lib/curation_concerns/models/engine.rb +61 -0
  60. data/lib/curation_concerns/models/resque.rb +36 -0
  61. data/lib/curation_concerns/models/utils.rb +22 -0
  62. data/lib/curation_concerns/models/version.rb +5 -0
  63. data/lib/curation_concerns/models.rb +32 -0
  64. data/lib/generators/curation_concerns/models/abstract_migration_generator.rb +30 -0
  65. data/lib/generators/curation_concerns/models/clamav_generator.rb +19 -0
  66. data/lib/generators/curation_concerns/models/fulltext_generator.rb +28 -0
  67. data/lib/generators/curation_concerns/models/install_generator.rb +70 -0
  68. data/lib/generators/curation_concerns/models/templates/app/models/collection.rb +4 -0
  69. data/lib/generators/curation_concerns/models/templates/app/models/generic_file.rb +4 -0
  70. data/lib/generators/curation_concerns/models/templates/config/clamav.rb +1 -0
  71. data/lib/generators/curation_concerns/models/templates/config/curation_concerns.rb +123 -0
  72. data/lib/generators/curation_concerns/models/templates/config/mime_types.rb +6 -0
  73. data/lib/generators/curation_concerns/models/templates/config/redis.yml +9 -0
  74. data/lib/generators/curation_concerns/models/templates/config/redis_config.rb +32 -0
  75. data/lib/generators/curation_concerns/models/templates/config/resque-pool.yml +1 -0
  76. data/lib/generators/curation_concerns/models/templates/config/resque_admin.rb +10 -0
  77. data/lib/generators/curation_concerns/models/templates/config/resque_config.rb +5 -0
  78. data/lib/generators/curation_concerns/models/templates/migrations/create_checksum_audit_logs.rb +19 -0
  79. data/lib/generators/curation_concerns/models/templates/migrations/create_version_committers.rb +15 -0
  80. data/lib/tasks/curation_concerns-models_tasks.rake +75 -0
  81. data/lib/tasks/migrate.rake +13 -0
  82. data/lib/tasks/resque.rake +13 -0
  83. data/lib/tasks/solr_reindex.rake +8 -0
  84. metadata +282 -0
@@ -0,0 +1,135 @@
1
module CurationConcerns
  # Convenience readers mixed into a Solr document class.
  #
  # The including class is expected to provide hash-style field access
  # (+self[field_name]+), +get+ and +id+; every reader below looks a field up
  # by the name Solrizer generates for it.
  module SolrDocumentBehavior
    # @return [String, nil] the title when present, otherwise the label
    def title_or_label
      title || label
    end

    ##
    # Give our SolrDocument an ActiveModel::Naming appropriate route_key
    def route_key
      get(Solrizer.solr_name('has_model', :symbol)).split(':').last.downcase
    end

    def to_param
      id
    end

    # Always returns a String: a +to_s+ that returns nil breaks callers that
    # expect the usual contract, so a missing title and label yields ''.
    def to_s
      title_or_label.to_s
    end

    ##
    # Offer the source (ActiveFedora-based) model to Rails for some of the
    # Rails methods (e.g. link_to).
    # @example
    #   link_to '...', SolrDocument(:id => 'bXXXXXX5').new => <a href="/dams_object/bXXXXXX5">...</a>
    def to_model
      @model ||= begin
        m = ActiveFedora::Base.load_instance_from_solr(id, self)
        # Fall back to the document itself when no specific model class was resolved.
        m.class == ActiveFedora::Base ? self : m
      end
    end

    def collection?
      hydra_model == 'Collection'
    end

    # Method to return the ActiveFedora model
    def hydra_model
      self[Solrizer.solr_name('active_fedora_model', Solrizer::Descriptor.new(:string, :stored, :indexed))]
    end

    def human_readable_type
      Array(self[Solrizer.solr_name('human_readable_type', :stored_searchable)]).first
    end

    def representative
      Array(self[Solrizer.solr_name('representative', :stored_searchable)]).first
    end

    # Formats the stored upload date with the +:standard+ date format.
    #
    # @return [String, nil] the formatted date; nil when the field is blank or
    #   cannot be parsed (the failure is logged at info level)
    def date_uploaded
      field = self[Solrizer.solr_name('date_uploaded', :stored_sortable, type: :date)]
      return unless field.present?
      begin
        # Array() tolerates both a single stored value and a multi-valued field.
        Date.parse(Array(field).first).to_formatted_s(:standard)
      rescue StandardError
        Rails.logger.info "Unable to parse date: #{field.inspect} for #{self['id']}"
        # Return nil explicitly rather than leaking the logger's return value.
        nil
      end
    end

    # @param default [String] value returned when no depositor is indexed
    def depositor(default = '')
      val = Array(self[Solrizer.solr_name('depositor')]).first
      val.present? ? val : default
    end

    def title
      Array(self[Solrizer.solr_name('title')]).first
    end

    def description
      Array(self[Solrizer.solr_name('description')]).first
    end

    def label
      Array(self[Solrizer.solr_name('label')]).first
    end

    def file_format
      Array(self[Solrizer.solr_name('file_format')]).first
    end

    def creator
      Array(self[Solrizer.solr_name('creator')]).first
    end

    # @return [Array] all indexed tag values (empty when none)
    def tags
      Array(self[Solrizer.solr_name('tag')])
    end

    # @return [Array] all indexed resource type values (empty when none)
    def resource_type
      Array(self[Solrizer.solr_name('resource_type')])
    end

    def mime_type
      Array(self[Solrizer.solr_name('mime_type')]).first
    end

    def read_groups
      Array(self[::Ability.read_group_field])
    end

    def edit_groups
      Array(self[::Ability.edit_group_field])
    end

    def edit_people
      Array(self[::Ability.edit_user_field])
    end

    def public?
      read_groups.include?('public')
    end

    def registered?
      read_groups.include?('registered')
    end

    def pdf?
      ['application/pdf'].include?(mime_type)
    end

    def image?
      ['image/png', 'image/jpeg', 'image/jpg', 'image/jp2', 'image/bmp', 'image/gif', 'image/tiff'].include?(mime_type)
    end

    def video?
      ['video/mpeg', 'video/mp4', 'video/webm', 'video/x-msvideo', 'video/avi', 'video/quicktime', 'application/mxf'].include?(mime_type)
    end

    def audio?
      # audio/x-wave is the mime type that fits 0.6.0 returns for a wav file.
      # audio/mpeg is the mime type that fits 0.6.0 returns for an mp3 file.
      ['audio/mp3', 'audio/mpeg', 'audio/x-wave', 'audio/x-wav', 'audio/ogg'].include?(mime_type)
    end
  end
end
@@ -0,0 +1,65 @@
1
# Behaviour mixed into the application's user model.
module CurationConcerns::User
  extend ActiveSupport::Concern

  # Copied piecemeal from the pcdm branch of sufia-models. More may yet be necessary.

  included do
    # Connects this user object to Blacklight's Bookmarks and Folders.
    include Blacklight::User
    include Hydra::User

    delegate :can?, :cannot?, to: :ability

    attr_accessor :update_directory
  end

  # Format the json for select2 which requires just an id and a field called text.
  # If we need an alternate format we should probably look at a json template gem
  #
  # @param opts [Hash, nil] accepted for signature compatibility; not used
  # @return [Hash] +{ id:, text: }+
  def as_json(opts = nil)
    { id: user_key, text: display_name ? "#{display_name} (#{user_key})" : user_key }
  end

  # Populate user instance with attributes from remote system (e.g., LDAP)
  # There is no default implementation -- override this in your application
  # def populate_attributes
  # end

  def email_address
    email
  end

  # Human-friendly name for the user.
  #
  # @return [String] the titleized display name, or the user key when the
  #   model has no display name (method missing, nil or blank)
  def name
    # An explicit check replaces the previous raise/rescue control flow, and
    # also makes a blank display name fall back to the user key.
    display = respond_to?(:display_name) ? display_name : nil
    display.present? ? display.to_s.titleize : user_key
  end

  # Redefine this for more intuitive keys in Redis
  def to_param
    # hack because rails doesn't like periods in urls.
    user_key.gsub(/\./, '-dot-')
  end

  # The basic groups method, override or will fallback to Sufia::Ldap::User
  # def groups
  #   @groups ||= self.group_list ? self.group_list.split(";?;") : []
  # end

  # @return [Ability] memoized ability object built for this user
  def ability
    @ability ||= ::Ability.new(self)
  end

  module ClassMethods
    # Reads the user stored under +Thread.current[:user]+.
    def current
      Thread.current[:user]
    end

    # Stores +user+ under +Thread.current[:user]+.
    def current=(user)
      Thread.current[:user] = user
    end

    # def from_url_component(component)
    #   User.find_by_user_key(component.gsub(/-dot-/, '.'))
    # end
  end
end
@@ -0,0 +1,98 @@
1
module CurationConcerns
  # This is a direct copy of Sufia::GenericFile::Metadata with a few modifications:
  # * title & description are single-value instead of multivalue
  #
  # NOTE(review): :label and :title are both declared with the dc:title
  # predicate below -- confirm that sharing one predicate is intended.
  module DefaultMetadata
    extend ActiveSupport::Concern

    included do
      # --- Single-valued bookkeeping properties ---
      property :label, predicate: ::RDF::DC.title, multiple: false

      property :depositor, predicate: ::RDF::URI.new('http://id.loc.gov/vocabulary/relators/dpt'), multiple: false do |idx|
        idx.as :symbol, :stored_searchable
      end

      property :relative_path, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#relativePath'), multiple: false

      property :import_url, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#importUrl'), multiple: false do |idx|
        idx.as :symbol
      end

      # --- Descriptive properties ---
      property :part_of, predicate: ::RDF::DC.isPartOf

      property :resource_type, predicate: ::RDF::DC.type do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :title, predicate: ::RDF::DC.title, multiple: false do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :creator, predicate: ::RDF::DC.creator do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :contributor, predicate: ::RDF::DC.contributor do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :description, predicate: ::RDF::DC.description, multiple: false do |idx|
        idx.type :text
        idx.as :stored_searchable
      end

      property :tag, predicate: ::RDF::DC.relation do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :rights, predicate: ::RDF::DC.rights do |idx|
        idx.as :stored_searchable
      end

      property :publisher, predicate: ::RDF::DC.publisher do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :date_created, predicate: ::RDF::DC.created do |idx|
        idx.as :stored_searchable
      end

      property :date_uploaded, predicate: ::RDF::DC.dateSubmitted, multiple: false do |idx|
        idx.type :date
        idx.as :stored_sortable
      end

      property :date_modified, predicate: ::RDF::DC.modified, multiple: false do |idx|
        idx.type :date
        idx.as :stored_sortable
      end

      property :subject, predicate: ::RDF::DC.subject do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :language, predicate: ::RDF::DC.language do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :identifier, predicate: ::RDF::DC.identifier do |idx|
        idx.as :stored_searchable
      end

      property :based_near, predicate: ::RDF::FOAF.based_near do |idx|
        idx.as :stored_searchable, :facetable
      end

      property :related_url, predicate: ::RDF::RDFS.seeAlso do |idx|
        idx.as :stored_searchable
      end

      property :bibliographic_citation, predicate: ::RDF::DC.bibliographicCitation do |idx|
        idx.as :stored_searchable
      end

      property :source, predicate: ::RDF::DC.source do |idx|
        idx.as :stored_searchable
      end

      # TODO: Move this somewhere more appropriate
      # Registration is best-effort: any failure is reported on stdout and
      # the remaining registrations are skipped.
      begin
        { 'subject' => 'lc_subjects', 'language' => 'lexvo_languages', 'tag' => 'lc_genres' }.each do |term, vocabulary|
          LocalAuthority.register_vocabulary(self, term, vocabulary)
        end
      rescue
        puts "tables for vocabularies missing"
      end
    end

    # Add a schema.org itemtype
    #
    # @return [String] the schema.org type configured for the first non-empty
    #   resource type, or 'http://schema.org/CreativeWork' when there is no
    #   match or the lookup raises
    def itemtype
      # Look up the first non-empty resource type value in a hash from the config
      first_type = resource_type.to_a.find { |type| !type.empty? }
      CurationConcerns.config.resource_types_to_schema[first_type] || 'http://schema.org/CreativeWork'
    rescue
      'http://schema.org/CreativeWork'
    end
  end
end
@@ -0,0 +1,29 @@
1
# Copied from Curate
module CurationConcerns
  # Helpers for objects that aggregate generic files.
  module WithGenericFiles
    extend ActiveSupport::Concern

    included do
      # The generic_files association and its accessor methods comes from Hydra::Works::AggregatesGenericFiles
      before_destroy :before_destroy_cleanup_generic_files
    end

    # Stopgap until ActiveFedora ContainerAssociation includes an *_ids accessor.
    # At the moment, this is no more efficient than calling generic_files, but hopefully that will change in the future.
    #
    # @return [Array] the id of every aggregated generic file
    def generic_file_ids
      generic_files.map(&:id)
    end

    # Destroys every aggregated generic file; registered above as a
    # before_destroy callback.
    def before_destroy_cleanup_generic_files
      generic_files.each { |generic_file| generic_file.destroy }
    end

    # Assigns this object's visibility to each generic file and saves it
    # with +save!+.
    def copy_visibility_to_files
      generic_files.each do |generic_file|
        generic_file.visibility = visibility
        generic_file.save!
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'active_attr'
2
module CurationConcerns
  # Form-style model for choosing one of the registered curation concern
  # types. +curation_concern_type+ must be present and must be one of
  # +registered_curation_concern_types+.
  class ClassifyConcern
    include ActiveAttr::Model
    attribute :curation_concern_type

    validates(
      :curation_concern_type,
      presence: true,
      inclusion: { in: ->(record) { record.registered_curation_concern_types } }
    )

    # Resolves a registered type name to its class.
    #
    # @param type [String] type name; it is camelized before lookup
    # @return [Class]
    def self.to_class(type)
      # TODO we may want to allow a different (or nil) namespace
      type.camelize.constantize
      # begin
      #   "::#{type.camelize}".constantize
      # rescue NameError
      #   "CurationConcerns::#{type}".constantize
      # end
    end

    # @return [Array<Class>] classes for every registered type, in name order
    def all_curation_concern_classes
      registered_curation_concern_types.sort.map { |type| self.class.to_class(type) }
    end

    # @return [Array<String>] type names taken from the CurationConcerns configuration
    def registered_curation_concern_types
      CurationConcerns.configuration.registered_curation_concern_types
    end

    # @return [Array<Array>] +[human_readable_type, type_name]+ pairs, one per
    #   registered type
    def possible_curation_concern_types
      registered_curation_concern_types.map do |concern|
        [self.class.to_class(concern).human_readable_type, concern]
      end
    end

    # Class for the currently selected type.
    #
    # @return [Class]
    # @raise [RuntimeError] when the selected type is not registered
    def curation_concern_class
      # Check membership against the registered names directly; there is no
      # need to constantize every registered type just to validate one name.
      unless registered_curation_concern_types.include?(curation_concern_type)
        raise RuntimeError, "Invalid :curation_concern_type"
      end
      self.class.to_class(curation_concern_type)
    end
  end
end
@@ -0,0 +1,31 @@
1
module CurationConcerns
  # Finds the registered curation concern classes a user is allowed to create.
  class QuickClassificationQuery
    # Builds a query from +args+ (see #initialize) and yields each permitted
    # class to the block.
    def self.each_for_context(*args, &block)
      new(*args).all.each(&block)
    end

    attr_reader :user

    # @param user [#can?] object answering +can?(:create, klass)+
    # @param options [Hash]
    # @option options [#call] :concern_name_normalizer turns a registered
    #   name into a class; defaults to ClassifyConcern.to_class
    # @option options [Array<String>] :registered_curation_concern_names
    #   defaults to the types registered in the CurationConcerns configuration
    def initialize(user, options = {})
      @user = user
      # Block-form fetch so the defaults are only resolved when no override is given.
      @concern_name_normalizer = options.fetch(:concern_name_normalizer) { ClassifyConcern.method(:to_class) }
      @registered_curation_concern_names = options.fetch(:registered_curation_concern_names) do
        CurationConcerns.configuration.registered_curation_concern_types
      end
    end

    # @return [Array<Class>] the normalized concern classes the user can create
    def all
      # Normalize once; the debug lines and the final filter share the result.
      klasses = normalized_curation_concern_names
      logger = ActiveFedora::Base.logger
      # Block-form logging: the messages (and the extra ability check) are
      # only evaluated when the logger actually emits debug output.
      logger.debug { "User is #{user}" }
      logger.debug { "try is #{klasses.first}" }
      logger.debug { "can is #{user.can?(:create, klasses.first)}" }
      klasses.select { |klass| user.can?(:create, klass) }
    end

    private

    attr_reader :concern_name_normalizer, :registered_curation_concern_names

    def normalized_curation_concern_names
      registered_curation_concern_names.map { |name| concern_name_normalizer.call(name) }
    end
  end
end
@@ -0,0 +1,148 @@
1
# OM datastream describing FITS characterization output: an OM terminology
# over the FITS XML namespace plus top-level proxy terms for each value.
class FitsDatastream < ActiveFedora::OmDatastream
  include OM::XML::Document

  set_terminology do |t|
    t.root(path: 'fits',
           xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
           schema: 'http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd')

    # --- Nested terms, following the element structure of the FITS document ---
    t.identification do
      t.identity do
        t.format_label(path: { attribute: 'format' })
        t.mime_type(path: { attribute: 'mimetype' })
      end
    end

    t.fileinfo do
      t.file_size(path: 'size')
      t.last_modified(path: 'lastmodified')
      t.filename(path: 'filename')
      t.original_checksum(path: 'md5checksum')
      t.rights_basis(path: 'rightsBasis')
      t.copyright_basis(path: 'copyrightBasis')
      t.copyright_note(path: 'copyrightNote')
    end

    t.filestatus do
      t.well_formed(path: 'well-formed')
      t.valid(path: 'valid')
      t.status_message(path: 'message')
    end

    t.metadata do
      t.document do
        t.file_title(path: 'title')
        t.file_author(path: 'author')
        t.file_language(path: 'language')
        t.page_count(path: 'pageCount')
        t.word_count(path: 'wordCount')
        t.character_count(path: 'characterCount')
        t.paragraph_count(path: 'paragraphCount')
        t.line_count(path: 'lineCount')
        t.table_count(path: 'tableCount')
        t.graphics_count(path: 'graphicsCount')
      end

      t.image do
        t.byte_order(path: 'byteOrder')
        t.compression(path: 'compressionScheme')
        t.width(path: 'imageWidth')
        t.height(path: 'imageHeight')
        t.color_space(path: 'colorSpace')
        t.profile_name(path: 'iccProfileName')
        t.profile_version(path: 'iccProfileVersion')
        t.orientation(path: 'orientation')
        t.color_map(path: 'colorMap')
        t.image_producer(path: 'imageProducer')
        t.capture_device(path: 'captureDevice')
        t.scanning_software(path: 'scanningSoftwareName')
        t.exif_version(path: 'exifVersion')
        t.gps_timestamp(path: 'gpsTimeStamp')
        t.latitude(path: 'gpsDestLatitude')
        t.longitude(path: 'gpsDestLongitude')
      end

      t.text do
        t.character_set(path: 'charset')
        t.markup_basis(path: 'markupBasis')
        t.markup_language(path: 'markupLanguage')
      end

      t.audio do
        t.duration(path: 'duration')
        t.bit_depth(path: 'bitDepth')
        t.sample_rate(path: 'sampleRate')
        t.channels(path: 'channels')
        t.data_format(path: 'dataFormatType')
        t.offset(path: 'offset')
      end

      t.video do
        t.width(path: 'imageWidth')
        t.height(path: 'imageHeight')
        t.duration(path: 'duration')
        t.sample_rate(path: 'sampleRate')
        t.frame_rate(path: 'frameRate')
      end
    end

    # --- Top-level proxies onto the nested terms ---
    # identification / identity
    t.format_label(proxy: [:identification, :identity, :format_label])
    t.mime_type(proxy: [:identification, :identity, :mime_type])

    # fileinfo
    t.file_size(proxy: [:fileinfo, :file_size])
    t.last_modified(proxy: [:fileinfo, :last_modified])
    t.filename(proxy: [:fileinfo, :filename])
    t.original_checksum(proxy: [:fileinfo, :original_checksum])
    t.rights_basis(proxy: [:fileinfo, :rights_basis])
    t.copyright_basis(proxy: [:fileinfo, :copyright_basis])
    t.copyright_note(proxy: [:fileinfo, :copyright_note])

    # filestatus
    t.well_formed(proxy: [:filestatus, :well_formed])
    t.valid(proxy: [:filestatus, :valid])
    t.status_message(proxy: [:filestatus, :status_message])

    # metadata / document
    t.file_title(proxy: [:metadata, :document, :file_title])
    t.file_author(proxy: [:metadata, :document, :file_author])
    t.page_count(proxy: [:metadata, :document, :page_count])
    t.file_language(proxy: [:metadata, :document, :file_language])
    t.word_count(proxy: [:metadata, :document, :word_count])
    t.character_count(proxy: [:metadata, :document, :character_count])
    t.paragraph_count(proxy: [:metadata, :document, :paragraph_count])
    t.line_count(proxy: [:metadata, :document, :line_count])
    t.table_count(proxy: [:metadata, :document, :table_count])
    t.graphics_count(proxy: [:metadata, :document, :graphics_count])

    # metadata / image; the video_* proxies point at the video terms that
    # share a name with an image term
    t.byte_order(proxy: [:metadata, :image, :byte_order])
    t.compression(proxy: [:metadata, :image, :compression])
    t.width(proxy: [:metadata, :image, :width])
    t.video_width(proxy: [:metadata, :video, :width])
    t.height(proxy: [:metadata, :image, :height])
    t.video_height(proxy: [:metadata, :video, :height])
    t.color_space(proxy: [:metadata, :image, :color_space])
    t.profile_name(proxy: [:metadata, :image, :profile_name])
    t.profile_version(proxy: [:metadata, :image, :profile_version])
    t.orientation(proxy: [:metadata, :image, :orientation])
    t.color_map(proxy: [:metadata, :image, :color_map])
    t.image_producer(proxy: [:metadata, :image, :image_producer])
    t.capture_device(proxy: [:metadata, :image, :capture_device])
    t.scanning_software(proxy: [:metadata, :image, :scanning_software])
    t.exif_version(proxy: [:metadata, :image, :exif_version])
    t.gps_timestamp(proxy: [:metadata, :image, :gps_timestamp])
    t.latitude(proxy: [:metadata, :image, :latitude])
    t.longitude(proxy: [:metadata, :image, :longitude])

    # metadata / text
    t.character_set(proxy: [:metadata, :text, :character_set])
    t.markup_basis(proxy: [:metadata, :text, :markup_basis])
    t.markup_language(proxy: [:metadata, :text, :markup_language])

    # metadata / audio; the video_* proxies point at the video terms that
    # share a name with an audio term
    t.duration(proxy: [:metadata, :audio, :duration])
    t.video_duration(proxy: [:metadata, :video, :duration])
    t.bit_depth(proxy: [:metadata, :audio, :bit_depth])
    t.sample_rate(proxy: [:metadata, :audio, :sample_rate])
    t.video_sample_rate(proxy: [:metadata, :video, :sample_rate])
    t.channels(proxy: [:metadata, :audio, :channels])
    t.data_format(proxy: [:metadata, :audio, :data_format])
    t.offset(proxy: [:metadata, :audio, :offset])

    # metadata / video
    t.frame_rate(proxy: [:metadata, :video, :frame_rate])
  end

  # Builds the skeleton FITS document: a +fits+ root element carrying the
  # namespace, schema location, version and timestamp attributes, with a
  # single identification/identity child.
  #
  # @return [Nokogiri::XML::Document]
  def self.xml_template
    root_attributes = {
      xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
      'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
      # Namespace and schema URL are separated by a line break, as in the
      # original multi-line literal.
      'xsi:schemaLocation' =>
        "http://hul.harvard.edu/ois/xml/ns/fits/fits_output\nhttp://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd",
      version: '0.6.0',
      timestamp: '1/25/12 11:04 AM'
    }

    template = Nokogiri::XML::Builder.new do |xml|
      xml.fits(root_attributes) do
        xml.identification { xml.identity(toolname: 'FITS') }
      end
    end
    template.doc
  end
end
@@ -0,0 +1,2 @@
1
# ActiveRecord model with no behaviour of its own beyond what
# ActiveRecord::Base provides.
class VersionCommitter < ActiveRecord::Base; end
@@ -0,0 +1,71 @@
1
module CurationConcerns
  # Run FITS to gather technical metadata about the content and the full text.
  # Store this extracted metadata in the characterization datastream.
  class CharacterizationService
    include Hydra::Derivatives::ExtractMetadata

    delegate :mime_type, :uri, to: :@generic_file
    attr_reader :generic_file

    # Shortcut for +new(generic_file).characterize+.
    def self.run(generic_file)
      new(generic_file).characterize
    end

    # @param generic_file the file object to characterize
    def initialize(generic_file)
      @generic_file = generic_file
    end

    ## Extract the metadata from the content datastream and record it in the characterization datastream
    def characterize
      fits_output = extract_metadata
      store_metadata(fits_output)

      full_text = extract_fulltext
      store_fulltext(full_text)

      generic_file.filename = [generic_file.original_file.original_name]
    end

    protected

    # Puts the extracted text into a newly built extracted_text file;
    # does nothing when the text is blank.
    def store_fulltext(extracted_text)
      return unless extracted_text.present?
      generic_file.build_extracted_text.content = extracted_text
    end

    def extract_fulltext
      FullTextExtractionService.run(generic_file)
    end

    # Stores non-blank FITS output on the characterization datastream, then
    # copies mapped FITS terms onto the generic file (this second step runs
    # even when +metadata+ is blank).
    def store_metadata(metadata)
      if metadata.present?
        generic_file.characterization.ng_xml = metadata
      end
      append_metadata
    end

    # Runs the :fits characterization tool over the original file's content.
    # Returns nil when the original file has no content.
    def extract_metadata
      source_file = generic_file.original_file
      return unless source_file.has_content?
      Hydra::FileCharacterization.characterize(source_file.content, filename_for_characterization.join, :fits) do |tool_paths|
        tool_paths[:fits] = Hydra::Derivatives.fits_path
      end
    end

    # Populate GenericFile's properties with fields from FITS (e.g. Author from pdfs)
    def append_metadata
      terms = generic_file.characterization_terms
      CurationConcerns.config.fits_to_desc_mapping.each_pair do |fits_term, property|
        next unless terms.has_key?(fits_term)
        # coerce to array to remove a conditional
        terms[fits_term] = [terms[fits_term]] unless terms[fits_term].is_a? Array
        terms[fits_term].each do |value|
          current = generic_file.send(property)
          if current.kind_of?(Array)
            # multi-valued property: append, skipping values already present
            current << value unless current.include?(value)
          else
            # these are single-valued terms which cannot be appended to
            generic_file.send("#{property}=", value)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module CurationConcerns
  # Extract the full text from the content using Solr's extract handler
  class FullTextExtractionService
    # Shortcut for +new(generic_file).extract+.
    def self.run(generic_file)
      new(generic_file).extract
    end

    delegate :original_file, :logger, :mime_type, :id, to: :@generic_file

    # @param generic_file object providing +original_file+, +logger+,
    #   +mime_type+ and +id+
    def initialize(generic_file)
      @generic_file = generic_file
    end

    # Posts the original file's content to the extract handler and returns
    # the extracted text.
    #
    # @return [String, nil] the extracted text with trailing whitespace
    #   removed; nil when anything goes wrong (the error is logged)
    def extract
      content = nil
      uri = URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
      content = original_file.content

      http = Net::HTTP.new(uri.host, uri.port)
      # Without this an https:// Solr URL would be spoken to in plain HTTP.
      http.use_ssl = (uri.scheme == 'https')

      resp = http.post(uri.request_uri, content,
                       'Content-type' => "#{mime_type};charset=utf-8",
                       'Content-Length' => content_length(content).to_s)
      raise "URL '#{uri}' returned code #{resp.code}" unless resp.code == "200"
      JSON.parse(resp.body)[''].rstrip
    rescue => e
      logger.error("Error extracting content from #{id}: #{e.inspect}")
      nil
    ensure
      # Rewind on failures too, so later readers of the same content do not
      # start at end-of-stream after a failed extraction.
      content.rewind if content.respond_to?(:rewind)
    end

    # Picks the Solr base URL from the Blacklight connection config, trying
    # in order: +:url+, +"url"+, +:fulltext+ => +"url"+, +:default+ => +"url"+.
    def connection_url
      config = Blacklight.connection_config
      return config[:url] if config[:url]
      return config["url"] if config["url"]
      return config[:fulltext]["url"] if config[:fulltext]
      config[:default]["url"]
    end

    private

    # Content-Length must be in bytes; String#size counts characters, so
    # prefer bytesize when the content offers it.
    def content_length(content)
      content.respond_to?(:bytesize) ? content.bytesize : content.size
    end
  end
end