curation_concerns-models 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.md +177 -0
  5. data/README.md +42 -0
  6. data/Rakefile +1 -0
  7. data/app/actors/concerns/curation_concerns/manages_embargoes_actor.rb +99 -0
  8. data/app/actors/curation_concerns/base_actor.rb +90 -0
  9. data/app/actors/curation_concerns/generic_file_actor.rb +150 -0
  10. data/app/actors/curation_concerns/work_actor_behavior.rb +88 -0
  11. data/app/jobs/active_fedora_id_based_job.rb +22 -0
  12. data/app/jobs/active_fedora_pid_based_job.rb +6 -0
  13. data/app/jobs/audit_job.rb +58 -0
  14. data/app/jobs/characterize_job.rb +11 -0
  15. data/app/jobs/copy_permissions_job.rb +24 -0
  16. data/app/jobs/create_derivatives_job.rb +15 -0
  17. data/app/jobs/import_url_job.rb +56 -0
  18. data/app/jobs/ingest_local_file_job.rb +48 -0
  19. data/app/jobs/resolrize_job.rb +9 -0
  20. data/app/models/checksum_audit_log.rb +21 -0
  21. data/app/models/concerns/curation_concerns/ability.rb +34 -0
  22. data/app/models/concerns/curation_concerns/basic_metadata.rb +87 -0
  23. data/app/models/concerns/curation_concerns/collection_behavior.rb +47 -0
  24. data/app/models/concerns/curation_concerns/generic_file/belongs_to_works.rb +53 -0
  25. data/app/models/concerns/curation_concerns/generic_file/characterization.rb +89 -0
  26. data/app/models/concerns/curation_concerns/generic_file/content.rb +8 -0
  27. data/app/models/concerns/curation_concerns/generic_file/export.rb +343 -0
  28. data/app/models/concerns/curation_concerns/generic_file/full_text_indexing.rb +12 -0
  29. data/app/models/concerns/curation_concerns/generic_file/indexing.rb +14 -0
  30. data/app/models/concerns/curation_concerns/generic_file/versions.rb +16 -0
  31. data/app/models/concerns/curation_concerns/generic_file.rb +5 -0
  32. data/app/models/concerns/curation_concerns/generic_file_behavior.rb +44 -0
  33. data/app/models/concerns/curation_concerns/generic_work_behavior.rb +38 -0
  34. data/app/models/concerns/curation_concerns/has_representative.rb +14 -0
  35. data/app/models/concerns/curation_concerns/human_readable_type.rb +23 -0
  36. data/app/models/concerns/curation_concerns/permissions/readable.rb +19 -0
  37. data/app/models/concerns/curation_concerns/permissions/writable.rb +75 -0
  38. data/app/models/concerns/curation_concerns/permissions.rb +7 -0
  39. data/app/models/concerns/curation_concerns/serializers.rb +15 -0
  40. data/app/models/concerns/curation_concerns/solr_document_behavior.rb +135 -0
  41. data/app/models/concerns/curation_concerns/user.rb +65 -0
  42. data/app/models/concerns/curation_concerns/with_basic_metadata.rb +98 -0
  43. data/app/models/concerns/curation_concerns/with_generic_files.rb +29 -0
  44. data/app/models/curation_concerns/classify_concern.rb +47 -0
  45. data/app/models/curation_concerns/quick_classification_query.rb +31 -0
  46. data/app/models/datastreams/fits_datastream.rb +148 -0
  47. data/app/models/version_committer.rb +2 -0
  48. data/app/services/curation_concerns/characterization_service.rb +71 -0
  49. data/app/services/curation_concerns/full_text_extraction_service.rb +38 -0
  50. data/app/services/curation_concerns/generic_file_audit_service.rb +85 -0
  51. data/app/services/curation_concerns/generic_file_indexing_service.rb +14 -0
  52. data/app/services/curation_concerns/generic_work_indexing_service.rb +16 -0
  53. data/app/services/curation_concerns/noid.rb +23 -0
  54. data/app/services/curation_concerns/repository_audit_service.rb +9 -0
  55. data/app/services/curation_concerns/versioning_service.rb +27 -0
  56. data/config/locales/curation_concerns.en.yml +6 -0
  57. data/curation_concerns-models.gemspec +34 -0
  58. data/lib/curation_concerns/messages.rb +66 -0
  59. data/lib/curation_concerns/models/engine.rb +61 -0
  60. data/lib/curation_concerns/models/resque.rb +36 -0
  61. data/lib/curation_concerns/models/utils.rb +22 -0
  62. data/lib/curation_concerns/models/version.rb +5 -0
  63. data/lib/curation_concerns/models.rb +32 -0
  64. data/lib/generators/curation_concerns/models/abstract_migration_generator.rb +30 -0
  65. data/lib/generators/curation_concerns/models/clamav_generator.rb +19 -0
  66. data/lib/generators/curation_concerns/models/fulltext_generator.rb +28 -0
  67. data/lib/generators/curation_concerns/models/install_generator.rb +70 -0
  68. data/lib/generators/curation_concerns/models/templates/app/models/collection.rb +4 -0
  69. data/lib/generators/curation_concerns/models/templates/app/models/generic_file.rb +4 -0
  70. data/lib/generators/curation_concerns/models/templates/config/clamav.rb +1 -0
  71. data/lib/generators/curation_concerns/models/templates/config/curation_concerns.rb +123 -0
  72. data/lib/generators/curation_concerns/models/templates/config/mime_types.rb +6 -0
  73. data/lib/generators/curation_concerns/models/templates/config/redis.yml +9 -0
  74. data/lib/generators/curation_concerns/models/templates/config/redis_config.rb +32 -0
  75. data/lib/generators/curation_concerns/models/templates/config/resque-pool.yml +1 -0
  76. data/lib/generators/curation_concerns/models/templates/config/resque_admin.rb +10 -0
  77. data/lib/generators/curation_concerns/models/templates/config/resque_config.rb +5 -0
  78. data/lib/generators/curation_concerns/models/templates/migrations/create_checksum_audit_logs.rb +19 -0
  79. data/lib/generators/curation_concerns/models/templates/migrations/create_version_committers.rb +15 -0
  80. data/lib/tasks/curation_concerns-models_tasks.rake +75 -0
  81. data/lib/tasks/migrate.rake +13 -0
  82. data/lib/tasks/resque.rake +13 -0
  83. data/lib/tasks/solr_reindex.rake +8 -0
  84. metadata +282 -0
@@ -0,0 +1,135 @@
1
module CurationConcerns
  # Convenience readers layered on top of a Solr document.
  #
  # The including object is expected to provide hash-style field access
  # (`self[field_name]`), a `get(field_name)` reader and an `id`; every method
  # here only reads fields through those.
  module SolrDocumentBehavior
    # MIME types recognised by the corresponding predicate methods.
    PDF_MIME_TYPES = ['application/pdf'].freeze
    IMAGE_MIME_TYPES = ['image/png', 'image/jpeg', 'image/jpg', 'image/jp2', 'image/bmp', 'image/gif', 'image/tiff'].freeze
    VIDEO_MIME_TYPES = ['video/mpeg', 'video/mp4', 'video/webm', 'video/x-msvideo', 'video/avi', 'video/quicktime', 'application/mxf'].freeze
    # audio/x-wave is the mime type that fits 0.6.0 returns for a wav file.
    # audio/mpeg is the mime type that fits 0.6.0 returns for an mp3 file.
    AUDIO_MIME_TYPES = ['audio/mp3', 'audio/mpeg', 'audio/x-wave', 'audio/x-wav', 'audio/ogg'].freeze

    # @return the title when one is indexed, otherwise the label
    def title_or_label
      title || label
    end

    ##
    # Give our SolrDocument an ActiveModel::Naming appropriate route_key
    # (the downcased last ':'-separated segment of the has_model field).
    def route_key
      get(Solrizer.solr_name('has_model', :symbol)).split(':').last.downcase
    end

    def to_param
      id
    end

    def to_s
      title_or_label
    end

    ##
    # Offer the source (ActiveFedora-based) model to Rails for some of the
    # Rails methods (e.g. link_to).
    # @example
    #   link_to '...', SolrDocument.new(:id => 'bXXXXXX5') => <a href="/dams_object/bXXXXXX5">...</a>
    # @return the model loaded from Solr, or self when the loaded instance is
    #   a bare ActiveFedora::Base. The result is memoized.
    def to_model
      @model ||= begin
        m = ActiveFedora::Base.load_instance_from_solr(id, self)
        m.class == ActiveFedora::Base ? self : m
      end
    end

    def collection?
      hydra_model == 'Collection'
    end

    # Method to return the ActiveFedora model
    def hydra_model
      self[Solrizer.solr_name('active_fedora_model', Solrizer::Descriptor.new(:string, :stored, :indexed))]
    end

    def human_readable_type
      Array(self[Solrizer.solr_name('human_readable_type', :stored_searchable)]).first
    end

    def representative
      Array(self[Solrizer.solr_name('representative', :stored_searchable)]).first
    end

    # Formats the indexed upload date with the :standard date format.
    #
    # The stored value may be a scalar or an array; only the first value is
    # used. A value that cannot be parsed is logged and reported as nil
    # (previously the logger's return value leaked out of the rescue).
    #
    # @return [String, nil] the formatted date, or nil when the field is
    #   blank or unparseable
    def date_uploaded
      field = self[Solrizer.solr_name("date_uploaded", :stored_sortable, type: :date)]
      value = Array(field).first
      return unless value.present?
      begin
        Date.parse(value).to_formatted_s(:standard)
      rescue StandardError
        # Best effort: a bad date must not break rendering of the document.
        Rails.logger.info "Unable to parse date: #{value.inspect} for #{self['id']}"
        nil
      end
    end

    # @param default value returned when no depositor is indexed
    def depositor(default = '')
      val = Array(self[Solrizer.solr_name("depositor")]).first
      val.present? ? val : default
    end

    def title
      Array(self[Solrizer.solr_name('title')]).first
    end

    def description
      Array(self[Solrizer.solr_name('description')]).first
    end

    def label
      Array(self[Solrizer.solr_name('label')]).first
    end

    def file_format
      Array(self[Solrizer.solr_name('file_format')]).first
    end

    def creator
      Array(self[Solrizer.solr_name("creator")]).first
    end

    # @return [Array] every indexed tag (empty when none)
    def tags
      Array(self[Solrizer.solr_name("tag")])
    end

    # @return [Array] every indexed resource type (empty when none)
    def resource_type
      Array(self[Solrizer.solr_name("resource_type")])
    end

    def mime_type
      Array(self[Solrizer.solr_name("mime_type")]).first
    end

    def read_groups
      Array(self[::Ability.read_group_field])
    end

    def edit_groups
      Array(self[::Ability.edit_group_field])
    end

    def edit_people
      Array(self[::Ability.edit_user_field])
    end

    def public?
      read_groups.include?('public')
    end

    def registered?
      read_groups.include?('registered')
    end

    def pdf?
      PDF_MIME_TYPES.include?(mime_type)
    end

    def image?
      IMAGE_MIME_TYPES.include?(mime_type)
    end

    def video?
      VIDEO_MIME_TYPES.include?(mime_type)
    end

    def audio?
      AUDIO_MIME_TYPES.include?(mime_type)
    end
  end
end
@@ -0,0 +1,65 @@
1
# User behaviour mixed into the host application's user model.
# The including class is expected to supply `user_key`, `display_name` and
# `email` (they are read here but not defined here).
module CurationConcerns::User
  extend ActiveSupport::Concern

  # Copied piecemeal from the pcdm branch of sufia-models. More may yet be necessary.

  included do
    # Connects this user object to Blacklight's Bookmarks and Folders.
    include Blacklight::User
    include Hydra::User

    delegate :can?, :cannot?, to: :ability

    attr_accessor :update_directory
  end

  # Format the json for select2 which requires just an id and a field called text.
  # If we need an alternate format we should probably look at a json template gem
  # @param opts accepted for as_json compatibility; not used
  # @return [Hash] with :id and :text keys
  def as_json(opts = nil)
    { id: user_key, text: display_name ? "#{display_name} (#{user_key})" : user_key }
  end

  # Populate user instance with attributes from remote system (e.g., LDAP)
  # There is no default implementation -- override this in your application
  # def populate_attributes
  # end

  def email_address
    email
  end

  # Human-friendly name: the titleized display name, falling back to the
  # user key when no usable display name is available.
  #
  # The fallback is an explicit check rather than raise/rescue, so a blank
  # display name now also yields the user key and unrelated errors are no
  # longer silently swallowed.
  def name
    label = display_name if respond_to?(:display_name)
    label.present? ? label.titleize : user_key
  end

  # Redefine this for more intuitive keys in Redis
  def to_param
    # hack because rails doesn't like periods in urls.
    user_key.gsub('.', '-dot-')
  end

  # The basic groups method, override or will fallback to Sufia::Ldap::User
  # def groups
  #   @groups ||= self.group_list ? self.group_list.split(";?;") : []
  # end

  # @return the memoized ::Ability built for this user
  def ability
    @ability ||= ::Ability.new(self)
  end

  module ClassMethods
    # The user stored for the current thread (nil when none was set).
    def current
      Thread.current[:user]
    end

    def current=(user)
      Thread.current[:user] = user
    end

    # def from_url_component(component)
    #   User.find_by_user_key(component.gsub(/-dot-/, '.'))
    # end
  end
end
@@ -0,0 +1,98 @@
1
module CurationConcerns
  # This is a direct copy of Sufia::GenericFile::Metadata with a few modifications:
  # * title & description are single-value instead of multivalue
  module DefaultMetadata
    extend ActiveSupport::Concern

    included do
      # NOTE(review): label and title are both mapped to ::RDF::DC.title --
      # confirm that sharing the predicate is intended.
      property :label, predicate: ::RDF::DC.title, multiple: false

      property(:depositor, predicate: ::RDF::URI.new("http://id.loc.gov/vocabulary/relators/dpt"), multiple: false) { |index| index.as :symbol, :stored_searchable }

      property :relative_path, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#relativePath'), multiple: false

      property(:import_url, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#importUrl'), multiple: false) { |index| index.as :symbol }

      property :part_of, predicate: ::RDF::DC.isPartOf
      property(:resource_type, predicate: ::RDF::DC.type) { |index| index.as :stored_searchable, :facetable }
      property(:title, predicate: ::RDF::DC.title, multiple: false) { |index| index.as :stored_searchable, :facetable }
      property(:creator, predicate: ::RDF::DC.creator) { |index| index.as :stored_searchable, :facetable }
      property(:contributor, predicate: ::RDF::DC.contributor) { |index| index.as :stored_searchable, :facetable }
      property(:description, predicate: ::RDF::DC.description, multiple: false) do |index|
        index.type :text
        index.as :stored_searchable
      end
      property(:tag, predicate: ::RDF::DC.relation) { |index| index.as :stored_searchable, :facetable }
      property(:rights, predicate: ::RDF::DC.rights) { |index| index.as :stored_searchable }
      property(:publisher, predicate: ::RDF::DC.publisher) { |index| index.as :stored_searchable, :facetable }
      property(:date_created, predicate: ::RDF::DC.created) { |index| index.as :stored_searchable }
      property(:date_uploaded, predicate: ::RDF::DC.dateSubmitted, multiple: false) do |index|
        index.type :date
        index.as :stored_sortable
      end
      property(:date_modified, predicate: ::RDF::DC.modified, multiple: false) do |index|
        index.type :date
        index.as :stored_sortable
      end
      property(:subject, predicate: ::RDF::DC.subject) { |index| index.as :stored_searchable, :facetable }
      property(:language, predicate: ::RDF::DC.language) { |index| index.as :stored_searchable, :facetable }
      property(:identifier, predicate: ::RDF::DC.identifier) { |index| index.as :stored_searchable }
      property(:based_near, predicate: ::RDF::FOAF.based_near) { |index| index.as :stored_searchable, :facetable }
      property(:related_url, predicate: ::RDF::RDFS.seeAlso) { |index| index.as :stored_searchable }
      property(:bibliographic_citation, predicate: ::RDF::DC.bibliographicCitation) { |index| index.as :stored_searchable }
      property(:source, predicate: ::RDF::DC.source) { |index| index.as :stored_searchable }

      # TODO: Move this somewhere more appropriate
      # Registration is best effort: any StandardError is reported on stdout
      # and the remaining registrations are skipped.
      begin
        { "subject" => "lc_subjects", "language" => "lexvo_languages", "tag" => "lc_genres" }.each do |field, vocabulary|
          LocalAuthority.register_vocabulary(self, field, vocabulary)
        end
      rescue StandardError
        puts "tables for vocabularies missing"
      end
    end

    # Add a schema.org itemtype
    # @return the configured schema URL for the first non-empty resource type,
    #   or 'http://schema.org/CreativeWork' when there is no mapping or the
    #   lookup raises
    def itemtype
      # Look up the first non-empty resource type value in a hash from the config
      filled_in = resource_type.to_a.reject(&:empty?)
      mapped = CurationConcerns.config.resource_types_to_schema[filled_in.first]
      mapped || 'http://schema.org/CreativeWork'
    rescue StandardError
      'http://schema.org/CreativeWork'
    end
  end
end
@@ -0,0 +1,29 @@
1
# Copied from Curate
module CurationConcerns
  # Behaviour for objects that aggregate generic files.
  module WithGenericFiles
    extend ActiveSupport::Concern

    included do
      # The generic_files association and its accessor methods comes from Hydra::Works::AggregatesGenericFiles
      before_destroy :before_destroy_cleanup_generic_files
    end

    # Stopgap until ActiveFedora ContainerAssociation includes an *_ids accessor.
    # At the moment, this is no more efficient than calling generic_files, but hopefully that will change in the future.
    # @return the id of every attached generic file
    def generic_file_ids
      generic_files.map(&:id)
    end

    # Destroys every attached generic file; registered above as a
    # before_destroy callback.
    def before_destroy_cleanup_generic_files
      generic_files.each { |attached| attached.destroy }
    end

    # Assigns this object's visibility to each attached generic file and
    # saves it with save!.
    def copy_visibility_to_files
      generic_files.each do |attached|
        attached.visibility = visibility
        attached.save!
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'active_attr'
2
module CurationConcerns
  # Form-style object used to pick which registered curation concern type
  # should be created.
  class ClassifyConcern
    include ActiveAttr::Model
    attribute :curation_concern_type

    validates(
      :curation_concern_type,
      presence: true,
      inclusion: { in: ->(record) { record.registered_curation_concern_types } }
    )

    # @return [Array<Class>] the classes for every registered type, in sorted
    #   type-name order
    def all_curation_concern_classes
      registered_curation_concern_types.sort.map { |c| self.class.to_class(c) }
    end

    # @return the type names registered in CurationConcerns.configuration
    def registered_curation_concern_types
      CurationConcerns.configuration.registered_curation_concern_types
    end

    # @return [Array<Array>] [human readable type, type name] pairs, one per
    #   registered type
    def possible_curation_concern_types
      registered_curation_concern_types.map do |concern|
        [self.class.to_class(concern).human_readable_type, concern]
      end
    end

    # Resolves the selected type to its class.
    #
    # Membership is checked against the registered type names directly, so
    # the other registered types are no longer constantized (and asked for
    # their human readable type) just to validate the selection.
    #
    # @raise [RuntimeError] when curation_concern_type is not registered
    def curation_concern_class
      unless registered_curation_concern_types.include?(curation_concern_type)
        raise RuntimeError, "Invalid :curation_concern_type"
      end
      self.class.to_class(curation_concern_type)
    end

    # @param type [String] a type name
    # @return [Class] the constant named by the camelized type
    def self.to_class(type)
      # TODO we may want to allow a different (or nil) namespace
      type.camelize.constantize
      # begin
      #   "::#{type.camelize}".constantize
      # rescue NameError
      #   "CurationConcerns::#{type}".constantize
      # end
    end
  end
end
@@ -0,0 +1,31 @@
1
module CurationConcerns
  # Finds the registered curation concerns that a given user may create.
  class QuickClassificationQuery
    # Builds a query from +args+ (see #initialize) and yields each creatable
    # concern to the block.
    def self.each_for_context(*args, &block)
      new(*args).all.each(&block)
    end

    attr_reader :user

    # @param user an object responding to can?(action, subject)
    # @param options [Hash]
    #   :concern_name_normalizer -- callable turning a registered name into
    #     the object handed to can? (defaults to ClassifyConcern.to_class)
    #   :registered_curation_concern_names -- names to consider (defaults to
    #     the configured registered_curation_concern_types)
    def initialize(user, options = {})
      @user = user
      # Block-form fetch keeps the defaults lazy: the global configuration and
      # ClassifyConcern are only touched when no override was supplied.
      @concern_name_normalizer = options.fetch(:concern_name_normalizer) { ClassifyConcern.method(:to_class) }
      @registered_curation_concern_names = options.fetch(:registered_curation_concern_names) do
        CurationConcerns.configuration.registered_curation_concern_types
      end
    end

    # @return [Array] the normalized concerns for which user.can?(:create, concern)
    def all
      candidates = normalized_curation_concern_names
      logger = ActiveFedora::Base.logger
      # Block-form logging: the messages (including the extra can? call) are
      # only built when the logger actually emits debug output.
      logger.debug { "User is #{user}" }
      logger.debug { "try is #{candidates.first}" }
      logger.debug { "can is #{user.can?(:create, candidates.first)}" }
      candidates.select { |klass| user.can?(:create, klass) }
    end

    private

      attr_reader :concern_name_normalizer, :registered_curation_concern_names

      # Registered names passed through the normalizer, in registration order.
      def normalized_curation_concern_names
        registered_curation_concern_names.map { |name| concern_name_normalizer.call(name) }
      end
  end
end
@@ -0,0 +1,148 @@
1
# OM datastream describing FITS characterization output.
# The terminology maps FITS XML elements/attributes to terms and then exposes
# each nested term through a top-level proxy.
class FitsDatastream < ActiveFedora::OmDatastream
  include OM::XML::Document

  set_terminology do |t|
    t.root(path: "fits",
           xmlns: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
           schema: "http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd")

    t.identification do
      t.identity do
        t.format_label(path: { attribute: "format" })
        t.mime_type(path: { attribute: "mimetype" })
      end
    end

    t.fileinfo do
      t.file_size(path: "size")
      t.last_modified(path: "lastmodified")
      t.filename(path: "filename")
      t.original_checksum(path: "md5checksum")
      t.rights_basis(path: "rightsBasis")
      t.copyright_basis(path: "copyrightBasis")
      t.copyright_note(path: "copyrightNote")
    end

    t.filestatus do
      t.well_formed(path: "well-formed")
      t.valid(path: "valid")
      t.status_message(path: "message")
    end

    t.metadata do
      t.document do
        t.file_title(path: "title")
        t.file_author(path: "author")
        t.file_language(path: "language")
        t.page_count(path: "pageCount")
        t.word_count(path: "wordCount")
        t.character_count(path: "characterCount")
        t.paragraph_count(path: "paragraphCount")
        t.line_count(path: "lineCount")
        t.table_count(path: "tableCount")
        t.graphics_count(path: "graphicsCount")
      end

      t.image do
        t.byte_order(path: "byteOrder")
        t.compression(path: "compressionScheme")
        t.width(path: "imageWidth")
        t.height(path: "imageHeight")
        t.color_space(path: "colorSpace")
        t.profile_name(path: "iccProfileName")
        t.profile_version(path: "iccProfileVersion")
        t.orientation(path: "orientation")
        t.color_map(path: "colorMap")
        t.image_producer(path: "imageProducer")
        t.capture_device(path: "captureDevice")
        t.scanning_software(path: "scanningSoftwareName")
        t.exif_version(path: "exifVersion")
        t.gps_timestamp(path: "gpsTimeStamp")
        t.latitude(path: "gpsDestLatitude")
        t.longitude(path: "gpsDestLongitude")
      end

      t.text do
        t.character_set(path: "charset")
        t.markup_basis(path: "markupBasis")
        t.markup_language(path: "markupLanguage")
      end

      t.audio do
        t.duration(path: "duration")
        t.bit_depth(path: "bitDepth")
        t.sample_rate(path: "sampleRate")
        t.channels(path: "channels")
        t.data_format(path: "dataFormatType")
        t.offset(path: "offset")
      end

      t.video do
        t.width(path: "imageWidth")
        t.height(path: "imageHeight")
        t.duration(path: "duration")
        t.sample_rate(path: "sampleRate")
        t.frame_rate(path: "frameRate")
      end
    end

    # Top-level proxies onto the nested terms declared above.
    # identification / fileinfo / filestatus
    t.format_label(proxy: [:identification, :identity, :format_label])
    t.mime_type(proxy: [:identification, :identity, :mime_type])
    t.file_size(proxy: [:fileinfo, :file_size])
    t.last_modified(proxy: [:fileinfo, :last_modified])
    t.filename(proxy: [:fileinfo, :filename])
    t.original_checksum(proxy: [:fileinfo, :original_checksum])
    t.rights_basis(proxy: [:fileinfo, :rights_basis])
    t.copyright_basis(proxy: [:fileinfo, :copyright_basis])
    t.copyright_note(proxy: [:fileinfo, :copyright_note])
    t.well_formed(proxy: [:filestatus, :well_formed])
    t.valid(proxy: [:filestatus, :valid])
    t.status_message(proxy: [:filestatus, :status_message])
    # document
    t.file_title(proxy: [:metadata, :document, :file_title])
    t.file_author(proxy: [:metadata, :document, :file_author])
    t.page_count(proxy: [:metadata, :document, :page_count])
    t.file_language(proxy: [:metadata, :document, :file_language])
    t.word_count(proxy: [:metadata, :document, :word_count])
    t.character_count(proxy: [:metadata, :document, :character_count])
    t.paragraph_count(proxy: [:metadata, :document, :paragraph_count])
    t.line_count(proxy: [:metadata, :document, :line_count])
    t.table_count(proxy: [:metadata, :document, :table_count])
    t.graphics_count(proxy: [:metadata, :document, :graphics_count])
    # image (unprefixed width/height) and video (video_* prefixed) dimensions
    t.byte_order(proxy: [:metadata, :image, :byte_order])
    t.compression(proxy: [:metadata, :image, :compression])
    t.width(proxy: [:metadata, :image, :width])
    t.video_width(proxy: [:metadata, :video, :width])
    t.height(proxy: [:metadata, :image, :height])
    t.video_height(proxy: [:metadata, :video, :height])
    t.color_space(proxy: [:metadata, :image, :color_space])
    t.profile_name(proxy: [:metadata, :image, :profile_name])
    t.profile_version(proxy: [:metadata, :image, :profile_version])
    t.orientation(proxy: [:metadata, :image, :orientation])
    t.color_map(proxy: [:metadata, :image, :color_map])
    t.image_producer(proxy: [:metadata, :image, :image_producer])
    t.capture_device(proxy: [:metadata, :image, :capture_device])
    t.scanning_software(proxy: [:metadata, :image, :scanning_software])
    t.exif_version(proxy: [:metadata, :image, :exif_version])
    t.gps_timestamp(proxy: [:metadata, :image, :gps_timestamp])
    t.latitude(proxy: [:metadata, :image, :latitude])
    t.longitude(proxy: [:metadata, :image, :longitude])
    # text
    t.character_set(proxy: [:metadata, :text, :character_set])
    t.markup_basis(proxy: [:metadata, :text, :markup_basis])
    t.markup_language(proxy: [:metadata, :text, :markup_language])
    # audio (unprefixed) and video (video_* prefixed where names collide)
    t.duration(proxy: [:metadata, :audio, :duration])
    t.video_duration(proxy: [:metadata, :video, :duration])
    t.bit_depth(proxy: [:metadata, :audio, :bit_depth])
    t.sample_rate(proxy: [:metadata, :audio, :sample_rate])
    t.video_sample_rate(proxy: [:metadata, :video, :sample_rate])
    t.channels(proxy: [:metadata, :audio, :channels])
    t.data_format(proxy: [:metadata, :audio, :data_format])
    t.offset(proxy: [:metadata, :audio, :offset])
    t.frame_rate(proxy: [:metadata, :video, :frame_rate])
  end

  # Builds the skeleton FITS document: a <fits> root carrying the namespace,
  # schema location, version and timestamp attributes, with a single
  # identification/identity child whose toolname is 'FITS'.
  # @return the Nokogiri document produced by the builder
  def self.xml_template
    template = Nokogiri::XML::Builder.new do |xml|
      xml.fits(xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
               'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
               # namespace URI and schema URL, separated by a line break
               'xsi:schemaLocation' => "http://hul.harvard.edu/ois/xml/ns/fits/fits_output\nhttp://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd",
               version: "0.6.0",
               timestamp: "1/25/12 11:04 AM") do
        xml.identification { xml.identity(toolname: 'FITS') }
      end
    end
    template.doc
  end
end
@@ -0,0 +1,2 @@
1
# ActiveRecord model that adds no behaviour of its own; everything it offers
# comes from ActiveRecord::Base.
class VersionCommitter < ActiveRecord::Base; end
@@ -0,0 +1,71 @@
1
module CurationConcerns
  # Run FITS to gather technical metadata about the content and the full text.
  # Store this extracted metadata in the characterization datastream.
  class CharacterizationService
    include Hydra::Derivatives::ExtractMetadata

    delegate :mime_type, :uri, to: :@generic_file
    attr_reader :generic_file

    # Convenience entry point: characterizes +generic_file+ in one call.
    def self.run(generic_file)
      new(generic_file).characterize
    end

    def initialize(generic_file)
      @generic_file = generic_file
    end

    ## Extract the metadata from the content datastream and record it in the characterization datastream
    # Also stores any extracted full text and sets the file's filename from
    # the original file's original name.
    def characterize
      store_metadata(extract_metadata)
      store_fulltext(extract_fulltext)
      generic_file.filename = [generic_file.original_file.original_name]
    end

    protected

      # Builds the extracted-text file and fills it, but only when some text
      # was actually extracted.
      def store_fulltext(extracted_text)
        return unless extracted_text.present?
        text_file = generic_file.build_extracted_text
        text_file.content = extracted_text
      end

      def extract_fulltext
        FullTextExtractionService.run(generic_file)
      end

      # Replaces the characterization XML when metadata is present, then
      # copies mapped FITS terms onto the generic file either way.
      def store_metadata(metadata)
        if metadata.present?
          generic_file.characterization.ng_xml = metadata
        end
        append_metadata
      end

      # Runs FITS over the original file's content.
      # @return the characterization output, or nil when the original file
      #   has no content
      def extract_metadata
        source = generic_file.original_file
        return unless source.has_content?
        Hydra::FileCharacterization.characterize(source.content, filename_for_characterization.join, :fits) do |config|
          config[:fits] = Hydra::Derivatives.fits_path
        end
      end

      # Populate GenericFile's properties with fields from FITS (e.g. Author from pdfs)
      def append_metadata
        terms = generic_file.characterization_terms
        CurationConcerns.config.fits_to_desc_mapping.each_pair do |fits_term, property_name|
          next unless terms.key?(fits_term)
          # coerce to array to remove a conditional
          terms[fits_term] = [terms[fits_term]] unless terms[fits_term].is_a? Array
          terms[fits_term].each do |value|
            current = generic_file.send(property_name)
            if current.kind_of?(Array)
              current << value unless current.include?(value)
            else
              # these are single-valued terms which cannot be appended to
              generic_file.send("#{property_name}=", value)
            end
          end
        end
      end
  end
end
@@ -0,0 +1,38 @@
1
module CurationConcerns
  # Extract the full text from the content using Solr's extract handler
  class FullTextExtractionService
    # Convenience entry point: extracts the text of +generic_file+ in one call.
    def self.run(generic_file)
      new(generic_file).extract
    end

    delegate :original_file, :logger, :mime_type, :id, to: :@generic_file

    def initialize(generic_file)
      @generic_file = generic_file
    end

    # POSTs the original file's content to Solr's extract handler.
    #
    # Failures (non-200 response, unparseable body, network errors) are
    # logged and reported as nil rather than raised.
    #
    # @return [String, nil] the extracted text with trailing whitespace
    #   removed, or nil when extraction failed
    def extract
      content = original_file.content
      uri = URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
      http = Net::HTTP.new(uri.host, uri.port)
      # Without this an https Solr URL would be spoken to in plain HTTP.
      http.use_ssl = (uri.scheme == 'https')
      # request_uri is the path plus query; the host is already on the connection.
      resp = http.post(uri.request_uri, content,
                       'Content-type' => "#{mime_type};charset=utf-8",
                       'Content-Length' => content_length(content).to_s)
      raise "URL '#{uri}' returned code #{resp.code}" unless resp.code == "200"
      JSON.parse(resp.body)[''].rstrip
    rescue => e
      logger.error("Error extracting content from #{id}: #{e.inspect}")
      nil
    ensure
      # Rewind on failure too, so later readers of the content start at the beginning.
      content.rewind if content.respond_to?(:rewind)
    end

    # @return the Solr base URL taken from the Blacklight connection config:
    #   the :url or "url" entry if present, otherwise the "url" of the
    #   :fulltext entry, otherwise the "url" of the :default entry
    def connection_url
      config = Blacklight.connection_config
      config[:url] || config["url"] || (config[:fulltext] ? config[:fulltext]["url"] : config[:default]["url"])
    end

    private

      # Length of the body in bytes; String#size would count characters and
      # under-report multibyte content.
      def content_length(content)
        content.respond_to?(:bytesize) ? content.bytesize : content.size
      end
  end
end