cul_scv_hydra 0.22.6 → 0.22.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/app/controllers/concerns/cul/hydra/controller.rb +22 -0
  3. data/app/controllers/concerns/cul/hydra/resolver.rb +69 -0
  4. data/app/controllers/concerns/cul/hydra/thumbnails.rb +62 -0
  5. data/app/controllers/concerns/cul/scv/hydra/controller.rb +3 -19
  6. data/app/controllers/concerns/cul/scv/hydra/resolver.rb +2 -65
  7. data/app/controllers/concerns/cul/scv/hydra/thumbnails.rb +3 -59
  8. data/app/models/concept.rb +1 -1
  9. data/app/models/concerns/cul/hydra/models.rb +24 -0
  10. data/app/models/concerns/cul/hydra/models/aggregator.rb +121 -0
  11. data/app/models/concerns/cul/hydra/models/common.rb +220 -0
  12. data/app/models/concerns/cul/hydra/models/image_resource.rb +78 -0
  13. data/app/models/concerns/cul/hydra/models/linkable_resources.rb +108 -0
  14. data/app/models/concerns/cul/hydra/models/resource.rb +87 -0
  15. data/app/models/concerns/cul/scv/hydra/models.rb +1 -13
  16. data/app/models/concerns/cul/scv/hydra/models/aggregator.rb +1 -116
  17. data/app/models/concerns/cul/scv/hydra/models/common.rb +1 -213
  18. data/app/models/concerns/cul/scv/hydra/models/image_resource.rb +3 -75
  19. data/app/models/concerns/cul/scv/hydra/models/linkable_resources.rb +3 -105
  20. data/app/models/concerns/cul/scv/hydra/models/resource.rb +2 -83
  21. data/app/models/cul/hydra/datastreams/dc_metadata.rb +107 -0
  22. data/app/models/cul/hydra/datastreams/mods_document.rb +195 -0
  23. data/app/models/cul/hydra/datastreams/struct_metadata.rb +176 -0
  24. data/app/models/cul/scv/hydra/datastreams/dc_metadata.rb +5 -104
  25. data/app/models/cul/scv/hydra/datastreams/mods_document.rb +5 -178
  26. data/app/models/cul/scv/hydra/datastreams/struct_metadata.rb +5 -174
  27. data/app/models/dc_document.rb +1 -1
  28. data/app/models/generic_aggregator.rb +5 -5
  29. data/app/models/generic_object.rb +2 -2
  30. data/app/models/generic_resource.rb +4 -4
  31. data/app/models/mets_structured_aggregator.rb +2 -2
  32. data/app/models/resource.rb +3 -3
  33. data/app/models/resource_aggregator.rb +3 -3
  34. data/fixtures/spec/CUL_MODS/mods-subjects.xml +24 -0
  35. data/lib/cul_hydra.rb +18 -0
  36. data/lib/cul_hydra/access_controls_enforcement.rb +53 -0
  37. data/lib/cul_hydra/controllers.rb +13 -0
  38. data/lib/cul_hydra/controllers/aggregates.rb +93 -0
  39. data/lib/cul_hydra/controllers/aggregator_controller_helper.rb +27 -0
  40. data/lib/cul_hydra/controllers/catalog.rb +12 -0
  41. data/lib/cul_hydra/controllers/content_aggregators.rb +81 -0
  42. data/lib/cul_hydra/controllers/datastreams.rb +145 -0
  43. data/lib/cul_hydra/controllers/helpers.rb +10 -0
  44. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/active_fedora_helper_behavior.rb +1 -1
  45. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/application_helper_behavior.rb +1 -1
  46. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/dc_metadata_helper_behavior.rb +1 -1
  47. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/hydra_assets_helper_behavior.rb +1 -1
  48. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/hydra_autocomplete_helper_behavior.rb +1 -1
  49. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/hydra_uploader_helper_behavior.rb +1 -1
  50. data/lib/{cul_scv_hydra → cul_hydra}/controllers/helpers/resources_helper_behavior.rb +1 -1
  51. data/lib/cul_hydra/controllers/resources.rb +161 -0
  52. data/lib/cul_hydra/controllers/static_image_aggregators.rb +105 -0
  53. data/lib/cul_hydra/controllers/suggestions.rb +126 -0
  54. data/lib/cul_hydra/controllers/terms.rb +205 -0
  55. data/lib/cul_hydra/engine.rb +31 -0
  56. data/lib/cul_hydra/fedora.rb +41 -0
  57. data/lib/cul_hydra/fedora/dummy_object.rb +37 -0
  58. data/lib/cul_hydra/fedora/rubydora_patch.rb +16 -0
  59. data/lib/cul_hydra/fedora/url_helper_behavior.rb +32 -0
  60. data/lib/cul_hydra/indexer.rb +84 -0
  61. data/lib/cul_hydra/om.rb +7 -0
  62. data/lib/cul_hydra/om/standard_mods.rb +115 -0
  63. data/lib/cul_hydra/risearch_members.rb +92 -0
  64. data/lib/cul_hydra/solrizer.rb +10 -0
  65. data/lib/cul_hydra/solrizer/extractor.rb +27 -0
  66. data/lib/cul_hydra/solrizer/mods_fieldable.rb +435 -0
  67. data/lib/cul_hydra/solrizer/terminology_based_solrizer.rb +35 -0
  68. data/lib/cul_hydra/solrizer/value_mapper.rb +46 -0
  69. data/lib/{cul_scv_hydra/solrizer/field_mapper.rb → cul_hydra/solrizer_patch.rb} +0 -0
  70. data/lib/cul_hydra/version.rb +8 -0
  71. data/lib/cul_hydra/version.rb~ +8 -0
  72. data/lib/cul_scv_fedora/dummy_object.rb +1 -30
  73. data/lib/cul_scv_fedora/rubydora_patch.rb +3 -7
  74. data/lib/cul_scv_fedora/url_helper_behavior.rb +3 -23
  75. data/lib/cul_scv_hydra.rb +5 -32
  76. data/lib/cul_scv_hydra/access_controls_enforcement.rb +3 -50
  77. data/lib/cul_scv_hydra/controllers.rb +10 -10
  78. data/lib/cul_scv_hydra/controllers/aggregates.rb +1 -86
  79. data/lib/cul_scv_hydra/controllers/aggregator_controller_helper.rb +4 -23
  80. data/lib/cul_scv_hydra/controllers/catalog.rb +5 -9
  81. data/lib/cul_scv_hydra/controllers/content_aggregators.rb +4 -77
  82. data/lib/cul_scv_hydra/controllers/datastreams.rb +3 -140
  83. data/lib/cul_scv_hydra/controllers/helpers.rb +44 -8
  84. data/lib/cul_scv_hydra/controllers/resources.rb +4 -157
  85. data/lib/cul_scv_hydra/controllers/static_image_aggregators.rb +4 -100
  86. data/lib/cul_scv_hydra/controllers/suggestions.rb +4 -122
  87. data/lib/cul_scv_hydra/controllers/terms.rb +4 -201
  88. data/lib/cul_scv_hydra/engine.rb +1 -1
  89. data/lib/cul_scv_hydra/indexer.rb +3 -82
  90. data/lib/cul_scv_hydra/om.rb +2 -2
  91. data/lib/cul_scv_hydra/om/standard_mods.rb +1 -108
  92. data/lib/cul_scv_hydra/risearch_members.rb +4 -89
  93. data/lib/cul_scv_hydra/solrizer.rb +5 -6
  94. data/lib/cul_scv_hydra/solrizer/extractor.rb +1 -25
  95. data/lib/cul_scv_hydra/solrizer/scv_mods_fieldable.rb +4 -429
  96. data/lib/cul_scv_hydra/solrizer/terminology_based_solrizer.rb +4 -32
  97. data/lib/cul_scv_hydra/solrizer/value_mapper.rb +1 -44
  98. data/lib/cul_scv_hydra/version.rb +5 -5
  99. data/lib/tasks/index.rake +2 -2
  100. data/lib/tasks/transform.rake +23 -0
  101. metadata +55 -12
@@ -0,0 +1,32 @@
1
module Cul
  module Hydra
    module Fedora
      # Mixin that assembles URLs for a Fedora repository: objects,
      # datastreams, object methods and the resource index search endpoint.
      module UrlHelperBehavior

        # Base URL of the repository. Read from the ActiveFedora credentials
        # on first use and cached in @fedora_url afterwards.
        def fedora_url
          return @fedora_url if @fedora_url
          @fedora_url = ActiveFedora.config.credentials[:url]
        end

        # Reduces a pid to the form used in URLs by removing a leading slash
        # and any "info:fedora/" prefix.
        def pid_for_url(pid)
          without_leading_slash = pid.gsub(/^\//, '')
          without_leading_slash.gsub(/info:fedora\//, '')
        end

        # URL of the object identified by +pid+.
        def fedora_object_url(pid)
          repository_base = fedora_url
          repository_base + '/objects/' + pid_for_url(pid)
        end

        # URL of datastream +dsid+ on the object identified by +pid+.
        def fedora_ds_url(pid, dsid)
          object_url = fedora_object_url(pid)
          object_url + '/datastreams/' + dsid
        end

        # URL of the method named +method+ on the object identified by +pid+.
        def fedora_method_url(pid, method)
          object_url = fedora_object_url(pid)
          object_url + '/methods/' + method
        end

        # URL of the repository's resource index search endpoint.
        def fedora_risearch_url
          repository_base = fedora_url
          repository_base + '/risearch'
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
# Helpers for (re)indexing a Fedora object together with all of its recursive
# members, as reported by Cul::Hydra::RisearchMembers.
module Cul::Hydra::Indexer

  # Yields +pid+ (unless it appears in +pids_to_omit+) and then the pid of
  # every recursive member of +pid+, minus any omitted pids.
  #
  # A RestClient::Unauthorized raised during the walk is printed (and logged
  # when a +logger+ is available) and ends the walk without being re-raised.
  #
  # @param pid [String] pid of the topmost object
  # @param pids_to_omit [Array<String>, nil] pids that must not be yielded
  # @param verbose_output [Boolean] print progress messages
  # @yield [String] each pid to process
  # @raise [RuntimeError] when +pid+ is blank or no Fedora object has that pid
  def self.descend_from(pid, pids_to_omit=nil, verbose_output=false)
    if pid.blank?
      raise 'Please supply a pid (e.g. rake recursively_index_fedora_objects pid=ldpd:123)'
    end

    begin
      unless ActiveFedora::Base.exists?(pid)
        raise 'Could not find Fedora object with pid: ' + pid
      end

      if pids_to_omit.present? && pids_to_omit.include?(pid)
        puts 'Skipping topmost object in this set (' + pid + ') because it has been intentionally omitted...' if verbose_output
      else
        puts 'Indexing topmost object in this set (' + pid + ')...' if verbose_output
        puts 'If this is a BagAggregator with a lot of members, this may take a while...' if verbose_output

        yield pid
      end

      puts 'Recursively retrieving and indexing all members of ' + pid + '...'

      # NOTE(review): the member query is always run with verbose output
      # enabled (literal true), independent of verbose_output — confirm intent.
      unique_pids = Cul::Hydra::RisearchMembers.get_recursive_member_pids(pid, true)

      total_number_of_members = unique_pids.length
      puts 'Recursive search found ' + total_number_of_members.to_s + ' members.' if verbose_output

      if pids_to_omit.present?
        unique_pids = unique_pids - pids_to_omit
        total_number_of_members = unique_pids.length
        puts 'After checking against the list of omitted pids, the total number of objects to index will be: ' + total_number_of_members.to_s if verbose_output
      end

      # The block parameter is deliberately not called "pid" so that the
      # rescue clause below unambiguously reports the topmost pid.
      unique_pids.each_with_index do |member_pid, index|
        puts 'Recursing on ' + (index + 1).to_s + ' of ' + total_number_of_members.to_s + ' members (' + member_pid + ')...' if verbose_output

        yield member_pid
      end

    rescue RestClient::Unauthorized => e
      error_message = "Skipping #{pid} due to error: " + e.message + '. Problem with Fedora object?'
      puts error_message
      logger.error error_message if defined?(logger)
    end

    puts 'Recursion complete!'
  end

  # Reindexes +top_pid+ and all of its recursive members via index_pid.
  #
  # @param top_pid [String] pid of the topmost object
  # @param pids_to_omit [Array<String>, nil] pids that must not be indexed
  # @param skip_generic_resources [Boolean] forwarded to index_pid
  # @param verbose_output [Boolean] print progress messages
  def self.recursively_index_fedora_objects(top_pid, pids_to_omit=nil, skip_generic_resources=false, verbose_output=false)
    descend_from(top_pid, pids_to_omit, verbose_output) do |pid|
      self.index_pid(pid, skip_generic_resources, verbose_output)
    end
  end

  # Loads the object with the given pid and updates its index entry.
  #
  # Ordinary errors (StandardError) are reported and the record is skipped.
  # Signals, SystemExit and other non-StandardError exceptions are not
  # rescued, so ctrl+c and process termination requests stop the run.
  #
  # @param pid [String] pid of the object to reindex
  # @param skip_generic_resources [Boolean] skip objects that are a GenericResource
  # @param verbose_output [Boolean] print 'done.' after a successful update
  def self.index_pid(pid, skip_generic_resources=false, verbose_output=false)
    active_fedora_object = ActiveFedora::Base.find(pid, :cast => true)

    if skip_generic_resources && active_fedora_object.is_a?(GenericResource)
      puts 'Object was skipped because GenericResources are being skipped and it is a GenericResource.'
    else
      active_fedora_object.update_index
      puts 'done.' if verbose_output
    end
  rescue StandardError => e
    puts "Encountered problem with #{pid}. Skipping record. Exception: #{e.message}"
  end
end
@@ -0,0 +1,7 @@
1
module Cul
  module Hydra
    # Namespace for OM datastream definitions. StandardMods is registered
    # for autoloading, so its file is required on first reference.
    module Om
      autoload(:StandardMods, "cul_hydra/om/standard_mods")
    end
  end
end
@@ -0,0 +1,115 @@
1
+ require 'active-fedora'
2
+ require 'solrizer'
3
+ require 'cul_hydra/solrizer_patch'
4
module Cul
  module Hydra
    module Om
      # OM datastream describing a MODS 3.4 document. Declares the term
      # terminology, an empty document template, and dynamic term accessors
      # (see method_missing).
      class StandardMods < ::ActiveFedora::OmDatastream

        set_terminology do |t|
          t.root(:path=>"mods",
                 :xmlns=>"http://www.loc.gov/mods/v3",
                 :schema=>"http://www.loc.gov/standards/mods/v3/mods-3-4.xsd")
          t.identifier(:path=>"identifier", :attributes=>{:type=>"local"}, :data_type=>:symbol)
          t.clio(:path=>"identifier", :attributes=>{:type=>"CLIO"}, :data_type=>:symbol)
          t.title_info(:path=>"titleInfo", :index_as=>[:not_searchable]) {
            t.main_title(:path=>"title", :index_as=>[:not_searchable])
          }
          t.title(:path=>'mods/oxns:titleInfo/oxns:title', :index_as=>[:searchable,:displayable, :sortable])
          t.abstract
          t.subject {
            t.topic
          }
          t.type_of_resource(:path=>"typeOfResource", :index_as=>[:not_searchable])
          t.physical_description(:path=>"physicalDescription", :index_as=>[:not_searchable]){
            t.form_marc(:path=>"form", :attributes=>{:authority=>"marcform"}, :index_as=>[:not_searchable])
            t.form_nomarc(:path=>"form[@authority !='marcform']", :index_as=>[:not_searchable, :displayable, :facetable, :textable])
            t.extent(:path=>"extent", :index_as=>[:not_searchable])
            t.reformatting_quality(:path=>"reformattingQuality", :index_as=>[:not_searchable])
            t.internet_media_type(:path=>"internetMediaType", :index_as=>[:not_searchable])
            t.digital_origin(:path=>"digitalOrigin", :index_as=>[:not_searchable])
          }
          t.lib_format(proxy: [:physical_description, :form_nomarc] )
          t.location(:path=>"location", :index_as=>[:not_searchable]){
            t.repo_text(:path=>"physicalLocation",:attributes=>{:authority=>:none}, :index_as=>[:not_searchable])
            t.repo_code(:path=>"physicalLocation",:attributes=>{:authority=>"marcorg"}, :index_as=>[:not_searchable])
          }
          t.lib_repo_text(:ref=>[:location, :repo_text], :label=>"lib_repo", :index_as=>[:searchable])
          t.lib_repo(:ref=>[:location, :repo_code], :index_as=>[:not_searchable,:facetable, :displayable])
          t.project_host(:path=>"relatedItem", :attributes=>{:type=>"host", :displayLabel=>"Project"}, :index_as=>[:not_searchable]){
            t.p_title(:path=>'titleInfo',:index_as=>[:not_searchable])
          }
          # NOTE(review): lib_project is declared twice (here as a proxy and
          # again below with an explicit path). Presumably the later
          # declaration is the effective one — confirm and drop the other.
          t.lib_project(:proxy=>[:project_host, :p_title],:index_as=>[:facetable,:displayable, :not_searchable])
          t.collection_host(:path=>"relatedItem", :attributes=>{:type=>"host", :displayLabel=>"Collection"}, :index_as=>[:not_searchable]){
            t.c_title(:path=>'titleInfo',:index_as=>[:not_searchable])
          }
          t.lib_project(:path=>"mods/oxns:relatedItem[@type='host'][@displayLabel='Project']/oxns:titleInfo/oxns:title",:index_as=>[:facetable,:displayable, :not_searchable])
          t.lib_collection(:path=>"mods/oxns:relatedItem[@type='host'][@displayLabel='Collection']/oxns:titleInfo/oxns:title",:index_as=>[:facetable,:displayable, :not_searchable])
          t.note(:path=>"note")
          t.access_condition(:path=>"accessCondition", :attributes=>{:type=>"useAndReproduction"}, :index_as => [:searchable], :data_type => :symbol)
          t.record_info(:path=>"recordInfo", :index_as=>[:not_searchable]) {
            t.record_creation_date(:path=>"recordCreationDate",:attributes=>{:encoding=>"w3cdtf"}, :index_as=>[:not_searchable])
            t.record_content_source(:path=>"recordContentSource",:attributes=>{:authority=>"marcorg"}, :index_as=>[:not_searchable])
            t.language_of_cataloging(:path=>"languageOfCataloging", :index_as=>[:not_searchable]){
              t.language_term(:path=>"languageTerm", :index_as=>[:not_searchable], :attributes=>{:type=>:none})
              t.language_code(:path=>"languageTerm",:attributes=>{:type=>'code',:authority=>"iso639-2b"}, :index_as=>[:not_searchable])
            }
            t.record_origin(:path=>"recordOrigin", :index_as=>[:not_searchable])
          }

          t.origin_info(:path=>"originInfo", :index_as=>[:not_searchable]){
            t.date(:path=>"dateIssued", :attributes=>{:encoding=>'w3cdtf'}, :index_as=>[:not_searchable])
            t.key_date(:path=>"dateIssued", :attributes=>{:encoding=>'w3cdtf',:keyDate=>'yes'}, :index_as=>[:not_searchable])
            t.start_date(:path=>"dateIssued", :attributes=>{:encoding=>'w3cdtf',:keyDate=>'yes',:point=>'start'}, :index_as=>[:not_searchable])
            t.end_date(:path=>"dateIssued", :attributes=>{:encoding=>'w3cdtf',:point=>'end'}, :index_as=>[:not_searchable])
          }
        end

        # Builds an empty MODS 3.4 document (UTF-8, with the MODS namespace
        # and xsi:schemaLocation set on the root element).
        #
        # @return [Nokogiri::XML::Document]
        def self.xml_template
          builder = Nokogiri::XML::Builder.new do |xml|
            xml.mods(:version=>"3.4",
                     "xmlns"=>"http://www.loc.gov/mods/v3",
                     "xmlns:xsi"=>"http://www.w3.org/2001/XMLSchema-instance"){
            }
          end
          builder.doc.encoding = 'UTF-8'
          builder.doc.root["xsi:schemaLocation"] = 'http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-4.xsd'
          return builder.doc
        end

        # Always returns the empty string; the original source notes this
        # override exists to maintain backwards compatibility.
        def prefix
          #if ::ActiveFedora::VERSION >= '8'
          #  Rails.logger.warn("the prefix method of #{self.class.name} was overriden to maintain backwards compatibility")
          #end
          ''
        end

        # Dynamic term accessors.
        #
        # +name+ returns find_by_terms(:name, *args) when :name is a declared
        # term and nil otherwise. +name?+ returns true only when that lookup
        # yields a non-empty result. If the lookup raises, the call is passed
        # on to super.
        def method_missing method, *args
          term_name = method.id2name
          query = (term_name[-1,1] == '?')
          term_name = term_name[0, term_name.length - 1] if query
          term = term_name.to_sym
          begin
            has_term = self.class.terminology.has_term?(term)
            result = has_term ? find_by_terms(term, *args) : nil
            if query
              return !(result.nil? || result.size() == 0)
            else
              return result
            end
          rescue
            super
          end
        end

        # Keeps respond_to? in step with method_missing for declared terms
        # (with or without a trailing '?'). Anything else defers to super.
        def respond_to_missing?(method, include_private = false)
          term_name = method.to_s.chomp('?')
          declared = begin
            !term_name.empty? && self.class.terminology.has_term?(term_name.to_sym)
          rescue StandardError
            # A failing terminology lookup must never break respond_to?.
            false
          end
          declared || super
        end

        # Delegates to super and then marks the datastream dirty.
        # Note: returns true (the value of the assignment), not super's result.
        def update_values(params)
          super
          self.dirty = true
        end
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
# Resource-index (iTQL) queries for membership and constituent relationships
# between Fedora objects. All methods are available on the module itself.
module Cul::Hydra::RisearchMembers
  module ClassMethods
    # Pids of every object that is a direct or indirect member of +pid+.
    #
    # @param pid [String] pid of the parent object
    # @param verbose_output [Boolean] print the query before running it
    # @param cmodel_type [String] 'all', or the name of a content model under
    #   the "ldpd:" namespace that members must have
    # @return [Array<String>] unique member pids without the "info:fedora/" prefix
    def get_recursive_member_pids(pid, verbose_output=false, cmodel_type='all')

      recursive_member_query =
        'select $child $parent from <#ri>
        where
        walk($child <http://purl.oclc.org/NET/CUL/memberOf> <fedora:' + pid + '> and $child <http://purl.oclc.org/NET/CUL/memberOf> $parent)'

      unless cmodel_type == 'all'
        recursive_member_query += ' and $child <fedora-model:hasModel> $cmodel'
        recursive_member_query += ' and $cmodel <mulgara:is> <info:fedora/ldpd:' + cmodel_type + '>'
      end

      results = run_tuples_query(recursive_member_query, 'json', verbose_output)
      results.map{|result| result['child'].gsub('info:fedora/', '') }.uniq
    end

    # Raw result rows for the direct members of +pid+.
    #
    # @param format [String] result format requested from the resource index;
    #   the response is always parsed as JSON
    # @return [Array<Hash>] the 'results' entry of the parsed response
    def get_direct_member_results(pid, verbose_output=false, format='json')

      direct_member_query =
        'select $pid from <#ri>
        where $pid <http://purl.oclc.org/NET/CUL/memberOf> <fedora:' + pid + '>'

      run_tuples_query(direct_member_query, format, verbose_output)
    end

    # Unique pids (without the "info:fedora/" prefix) of the direct members of +pid+.
    def get_direct_member_pids(pid, verbose_output=false)
      unique_pids = get_direct_member_results(pid,verbose_output,'json')
      unique_pids.map{|result| result['pid'].gsub('info:fedora/', '') }.uniq
    end

    # Number of direct members of +pid+; 0 when the count query returns nothing.
    def get_direct_member_count(pid, verbose_output=false)
      count = get_direct_member_results(pid,verbose_output,'count/json')
      return count.blank? ? 0 : count[0]['count'].to_i
    end

    #Project constituents

    # Raw result rows for the objects that are constituents of +pid+.
    #
    # @param format [String] result format requested from the resource index;
    #   the response is always parsed as JSON
    # @return [Array<Hash>] the 'results' entry of the parsed response
    def get_project_constituent_results(pid, verbose_output=false, format='json')

      project_constituent_query =
        'select $pid from <#ri>
        where $pid <info:fedora/fedora-system:def/relations-external#isConstituentOf> <fedora:' + pid + '>'

      run_tuples_query(project_constituent_query, format, verbose_output)
    end

    # Unique pids (without the "info:fedora/" prefix) of the constituents of +pid+.
    def get_project_constituent_pids(pid, verbose_output=false)
      unique_pids = get_project_constituent_results(pid,verbose_output,'json')
      unique_pids.map{|result| result['pid'].gsub('info:fedora/', '') }.uniq
    end

    # Number of constituents of +pid+; 0 when the count query returns nothing.
    def get_project_constituent_count(pid, verbose_output=false)
      count = get_project_constituent_results(pid,verbose_output,'count/json')
      return count.blank? ? 0 : count[0]['count'].to_i
    end

    private

    # Runs an iTQL tuples query against the repository and returns the
    # 'results' entry of the JSON response. Shared by all query methods above.
    def run_tuples_query(query, format, verbose_output)
      puts 'Performing query:' if verbose_output
      puts query if verbose_output

      search_response = JSON(Cul::Hydra::Fedora.repository.find_by_itql(query, {
        :type => 'tuples',
        :format => format,
        :limit => '',
        :stream => 'on'
      }))

      search_response['results']
    end
  end
  extend ClassMethods
end
@@ -0,0 +1,10 @@
1
module Cul
  module Hydra
    # Namespace for the Solr indexing helpers. Each constant listed below is
    # registered for autoloading from the given path, in the listed order.
    module Solrizer
      [
        [:Extractor, "cul_hydra/solrizer/extractor"],
        [:TerminologyBasedSolrizer, "cul_hydra/solrizer/terminology_based_solrizer"],
        [:ValueMapper, "cul_hydra/solrizer/value_mapper"],
        [:ModsFieldable, "cul_hydra/solrizer/mods_fieldable"]
      ].each do |constant_name, path|
        autoload constant_name, path
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Cul::Hydra::Solrizer
  class Extractor < ::Solrizer::Extractor
    # Adds +field_value+ (after format_node_value) under +field_name+ in
    # +solr_doc+ without discarding values that are already there: a new key
    # gets a one-element array, an existing key has the value appended.
    # With +unique+ set, a value already present under the key is not
    # appended again.
    # @param [Hash] solr_doc
    # @param [String] field_name
    # @param [String] field_value
    # @param [boolean] unique
    # @return [Hash] the same solr_doc, modified in place
    def self.insert_solr_field_value(solr_doc, field_name, field_value, unique=false)
      value = self.format_node_value(field_value)

      unless solr_doc.has_key?(field_name)
        solr_doc.merge!(field_name => [value])
        return solr_doc
      end

      duplicate = unique && solr_doc[field_name].include?(value)
      solr_doc[field_name] << value unless duplicate
      solr_doc
    end

    # Instance-level convenience wrapper around the class method above.
    def insert_solr_field_value(solr_doc, field_name, field_value, unique=false)
      Cul::Hydra::Solrizer::Extractor.insert_solr_field_value(solr_doc, field_name, field_value, unique)
    end
  end
end
@@ -0,0 +1,435 @@
1
+ module Cul::Hydra::Solrizer
2
+ module ModsFieldable
3
+ extend ActiveSupport::Concern
4
+ include Solrizer::DefaultDescriptors::Normal
5
+
6
+ MODS_NS = {'mods'=>'http://www.loc.gov/mods/v3'}
7
+
8
# Value-mapping and text-normalization helpers shared by ModsFieldable.
module ClassMethods
  # Memoized ValueMapper. +maps+ is only used the first time this is called;
  # later calls return the mapper that was built then.
  def value_mapper(maps=nil)
    return @value_mapper if @value_mapper
    @value_mapper = ValueMapper.new(maps)
  end

  # Delegates to the memoized value mapper.
  def map_field(field_key, map_key)
    value_mapper.map_field(field_key, map_key)
  end

  # Delegates to the memoized value mapper.
  def map_value(field_key, value_key)
    value_mapper.map_value(field_key, value_key)
  end

  # Delegates to the memoized value mapper.
  def maps_field?(field_key)
    value_mapper.maps_field?(field_key)
  end

  # Returns a cleaned-up copy of +t+: surrounding whitespace removed and
  # every internal whitespace run collapsed to one space.
  #
  # With +strip_punctuation+, one layer of each wrapping pair
  # ( ), { }, [ ], " ", ' ', < > is removed in that order, then any leading
  # punctuation, and the result is stripped again.
  def normalize(t, strip_punctuation=false)
    text = t.dup.strip
    text.gsub!(/\s+/, ' ')
    return text unless strip_punctuation

    wrappers = [
      /^\((.*)\)$/,
      /^\{(.*)\}$/,
      /^\[(.*)\]$/,
      /^"(.*)"$/,
      /^'(.*)'$/,
      /^<(.*)>$/
    ]
    wrappers.each { |wrapper| text = text.sub(wrapper, "\\1") }
    #text = text.sub(/^\p{Ps}(.*)\p{Pe}/u, "\\1")
    text = text.sub(/^[[:punct:]]+/, '')
    # removing punctuation may have exposed leading/trailing space
    text.strip!
    text
  end
end
45
+
46
+ extend ClassMethods
47
+
48
# First /mods:mods element of this object's XML (ng_xml), or nil when the
# query matches nothing.
def mods
  mods_roots = ng_xml.xpath('/mods:mods', MODS_NS)
  mods_roots.first
end
51
+
52
# Normalized (punctuation-stripped) main titles of every host relatedItem
# labelled 'Project'.
#
# Host items for which main_title returns nil are skipped; passing nil on to
# ModsFieldable.normalize would raise.
#
# @return [Array<String>]
def projects
  project_hosts = mods.xpath("./mods:relatedItem[@type='host' and @displayLabel='Project']", MODS_NS)
  host_titles = project_hosts.map { |p_node| main_title(p_node) }.compact
  host_titles.map { |title| ModsFieldable.normalize(title, true) }
end
57
+
58
# Normalized (punctuation-stripped) main titles of every host relatedItem
# labelled 'Collection'.
#
# Host items for which main_title returns nil are skipped; passing nil on to
# ModsFieldable.normalize would raise.
#
# @return [Array<String>]
def collections
  collection_hosts = mods.xpath("./mods:relatedItem[@type='host' and @displayLabel='Collection']", MODS_NS)
  host_titles = collection_hosts.map { |p_node| main_title(p_node) }.compact
  host_titles.map { |title| ModsFieldable.normalize(title, true) }
end
63
+
64
# Title used for sorting: the text of the first untyped titleInfo with any
# nonSort child left out, normalized with punctuation stripping.
# Returns nil when nothing is left after normalization.
def sort_title(node=mods)
  title_info = node.xpath('./mods:titleInfo[not(@type)]', MODS_NS).first
  sortable_parts = []
  if title_info
    sortable_children = title_info.children.reject { |child| child.name == 'nonSort' }
    sortable_parts = sortable_children.map { |child| child.text }
  end
  normalized = ModsFieldable.normalize(sortable_parts.join, true)
  normalized.empty? ? nil : normalized
end
77
+
78
# Normalized text of the first untyped titleInfo under +node+, or nil when
# there is none.
def main_title(node=mods)
  untyped_title = node.xpath('./mods:titleInfo[not(@type)]', MODS_NS).first
  return nil unless untyped_title
  ModsFieldable.normalize(untyped_title.text)
end
87
+
88
# All titles of +node+ without descending into relatedItems. For now this is
# the main title (when present) followed by the selected alternative titles.
#
# The lookups are made against +node+ (previously the argument was ignored
# and the document root was always used), and an array is always returned.
#
# @return [Array<String>]
def titles(node=mods)
  all_titles = []
  primary = main_title(node)
  all_titles << primary unless primary.nil?
  alternates = alternative_titles(node)
  all_titles += alternates unless alternates.nil?
  all_titles
end
95
+
96
# Normalized text of every titleInfo under +node+ whose type is alternative,
# abbreviated, translated or uniform.
def alternative_titles(node=mods)
  typed_titles = node.xpath('./mods:titleInfo[@type and (@type="alternative" or @type="abbreviated" or @type="translated" or @type="uniform")]', MODS_NS)
  typed_titles.map { |title_info| ModsFieldable.normalize(title_info.text) }
end
101
+
102
# Normalized names from the document's mods:name elements. Each name is the
# text of its namePart children joined with a space, punctuation-stripped.
#
# With +role_authority+, only names having a roleTerm of that authority are
# returned; with +role+ as well, the roleTerm text must also equal it.
#
# Note: subject names are intentionally not part of this extraction.
# See: https://issues.cul.columbia.edu/browse/DCV-231 and https://issues.cul.columbia.edu/browse/SCV-102
def names(role_authority=nil, role=nil)
  name_query = "./mods:name"
  unless role_authority.nil?
    role_filter = "@authority='#{role_authority.to_s}'"
    role_filter += " and normalize-space(text()) = '#{role.to_s.strip}'" unless role.nil?
    name_query += "/mods:role/mods:roleTerm[#{role_filter}]/ancestor::mods:name"
  end

  mods.xpath(name_query, MODS_NS).map do |name_node|
    name_parts = name_node.xpath('./mods:namePart', MODS_NS).map { |part| part.text }
    ModsFieldable.normalize(name_parts.join(' '), true)
  end
end
128
+
129
# Placeholder: not implemented, always returns nil.
# Intended behaviour per the original note: get all the dateIssued with
# keyDate = 'yes', but not point = 'end'.
def dates(node=mods)
  nil
end
132
+
133
# Normalized physicalDescription/form values whose authority attribute is
# something other than 'marcform'.
def formats(node=mods)
  non_marc_forms = node.xpath("./mods:physicalDescription/mods:form[@authority != 'marcform']", MODS_NS)
  non_marc_forms.map { |form| ModsFieldable.normalize(form.text) }
end
139
+
140
# Normalized text of the first location/physicalLocation with
# authority 'marcorg', or nil when there is none.
def repository_code(node=mods)
  marcorg_locations = node.xpath("./mods:location/mods:physicalLocation[@authority = 'marcorg']", MODS_NS)
  code_node = marcorg_locations.first
  return nil unless code_node
  ModsFieldable.normalize(code_node.text)
end
150
+
151
# Normalized text of the first location/physicalLocation that has no
# authority attribute, or nil when there is none.
def repository_text(node=mods)
  plain_locations = node.xpath("./mods:location/mods:physicalLocation[not(@authority)]", MODS_NS)
  text_node = plain_locations.first
  return nil unless text_node
  ModsFieldable.normalize(text_node.text)
end
161
+
162
# Translates a repository MARC code via the SHORT_REPO, LONG_REPO or
# FULL_REPO table selected by +type+ ('short', 'long' or 'full'), with
# 'Non-Columbia Location' as the default. Any other +type+ yields nil.
# The code is used as given (it is not normalized here).
def translate_repo_marc_code(code, type)
  case type
  when 'short'
    translate_with_default(SHORT_REPO, code, 'Non-Columbia Location')
  when 'long'
    translate_with_default(LONG_REPO, code, 'Non-Columbia Location')
  when 'full'
    translate_with_default(FULL_REPO, code, 'Non-Columbia Location')
  else
    nil
  end
end
175
+
176
# Translates a project title via the SHORT_PROJ or FULL_PROJ table selected
# by +type+ ('short' or 'full'). The title is normalized first and the
# normalized title is also the default. Any other +type+ yields nil.
def translate_project_title(project_title, type)
  title = ModsFieldable.normalize(project_title)

  case type
  when 'short'
    translate_with_default(SHORT_PROJ, title, title)
  when 'full'
    translate_with_default(FULL_PROJ, title, title)
  else
    nil
  end
end
187
+
188
# Normalized, punctuation-stripped text of every location/shelfLocator
# under +node+.
def shelf_locators(node=mods)
  locators = node.xpath("./mods:location/mods:shelfLocator", MODS_NS)
  locators.map { |locator| ModsFieldable.normalize(locator.text, true) }
end
193
+
194
# Normalized, punctuation-stripped text of the originInfo dateCreated,
# dateIssued and dateOther elements (in that order) that carry neither a
# keyDate nor a point attribute.
#
# NOTE(review): the queries also test not(@w3cdtf), i.e. an attribute
# literally named "w3cdtf". Elsewhere in this gem w3cdtf appears as the value
# of the encoding attribute, so this test presumably never excludes
# anything — confirm whether not(@encoding) was intended.
def textual_dates(node=mods)
  date_queries = [
    "./mods:originInfo/mods:dateCreated[not(@keyDate) and not(@point) and not(@w3cdtf)]",
    "./mods:originInfo/mods:dateIssued[not(@keyDate) and not(@point) and not(@w3cdtf)]",
    "./mods:originInfo/mods:dateOther[not(@keyDate) and not(@point) and not(@w3cdtf)]"
  ]
  date_queries.inject([]) do |found, query|
    found + node.xpath(query, MODS_NS).map { |date_node| ModsFieldable.normalize(date_node.text, true) }
  end
end
207
+
208
# Describes a year range as a one-element array of text.
#
# Both years are read with to_i (which drops zero-padding). Equal years give
# just that year; otherwise the text is 'Between <start> and <end>'. A year
# that is not greater than zero is printed without its first character and
# followed by ' BCE'. A positive end year following a non-positive start
# year is followed by ' CE'.
def date_range_to_textual_date(start_year, end_year)
  first_year = start_year.to_i
  last_year = end_year.to_i
  return [first_year.to_s] if first_year == last_year

  first_label = if first_year > 0
    first_year.to_s
  else
    first_year.to_s[1..-1] + ' BCE'
  end

  last_label = if last_year > 0
    first_year > 0 ? last_year.to_s : last_year.to_s + ' CE'
  else
    last_year.to_s[1..-1] + ' BCE'
  end

  ['Between ' + first_label + ' and ' + last_label]
end
222
+
223
# Normalized, punctuation-stripped text of every note whose type is 'date'
# or 'date source'.
def date_notes(node=mods)
  dated_notes = node.xpath("./mods:note[@type = 'date' or @type = 'date source']", MODS_NS)
  dated_notes.map { |note| ModsFieldable.normalize(note.text, true) }
end
230
+
231
# Normalized, punctuation-stripped text of every note that is untyped or
# whose type is neither 'date' nor 'date source'.
def non_date_notes(node=mods)
  other_notes = node.xpath("./mods:note[not(@type) or (@type != 'date' and @type != 'date source')]", MODS_NS)
  other_notes.map { |note| ModsFieldable.normalize(note.text, true) }
end
238
+
239
# Normalized, punctuation-stripped text of every location/url marked with
# access 'object in context' and usage 'primary display'.
def item_in_context_url(node=mods)
  context_urls = node.xpath("./mods:location/mods:url[@access='object in context' and @usage='primary display']", MODS_NS)
  context_urls.map { |url| ModsFieldable.normalize(url.text, true) }
end
246
+
247
# Normalized, punctuation-stripped text of every location/url found under a
# host relatedItem labelled 'Project'.
def project_url(node=mods)
  host_urls = node.xpath("./mods:relatedItem[@type='host' and @displayLabel='Project']/mods:location/mods:url", MODS_NS)
  host_urls.map { |url| ModsFieldable.normalize(url.text, true) }
end
254
+
255
# Normalized, punctuation-stripped text of the subject sub-elements of
# +node+, grouped by element in this order: topic, geographic, name,
# temporal, titleInfo, genre.
def all_subjects(node=mods)
  subject_queries = [
    "./mods:subject/mods:topic",
    "./mods:subject/mods:geographic",
    "./mods:subject/mods:name",
    "./mods:subject/mods:temporal",
    "./mods:subject/mods:titleInfo",
    "./mods:subject/mods:genre"
  ]
  subject_queries.inject([]) do |subjects, query|
    subjects + node.xpath(query, MODS_NS).map { |subject| ModsFieldable.normalize(subject.text, true) }
  end
end
279
+
280
# Normalized, punctuation-stripped text of every originInfo/place/placeTerm
# under +node+.
def origin_info_place(node=mods)
  place_terms = node.xpath("./mods:originInfo/mods:place/mods:placeTerm", MODS_NS)
  place_terms.map { |place_term| ModsFieldable.normalize(place_term.text, true) }
end
287
+
288
# Place terms to show for display. The placeTerms without a valueURI
# attribute are preferred; those with a valueURI are returned only when no
# placeTerm lacks one. All values are normalized and punctuation-stripped.
def origin_info_place_for_display(node=mods)
  with_uri = node.xpath("./mods:originInfo/mods:place/mods:placeTerm[@valueURI]", MODS_NS).map do |place_term|
    ModsFieldable.normalize(place_term.text, true)
  end
  without_uri = node.xpath("./mods:originInfo/mods:place/mods:placeTerm[not(@valueURI)]", MODS_NS).map do |place_term|
    ModsFieldable.normalize(place_term.text, true)
  end

  without_uri.empty? ? with_uri : without_uri
end
301
+
302
# Normalized subject/cartographics/coordinates values that contain a
# decimal pair in the expected form, e.g. 40.123456,-73.5678. Values without
# such a pair are left out.
def coordinates(node=mods)
  coordinate_nodes = node.xpath("./mods:subject/mods:cartographics/mods:coordinates", MODS_NS)
  candidates = coordinate_nodes.map { |coordinate| ModsFieldable.normalize(coordinate.text, true) }
  candidates.select { |candidate| candidate.match(/-*\d+\.\d+\s*,\s*-*\d+\.\d+\s*/) }
end
312
+
313
# Populates a Solr document hash with descriptive fields read from the MODS record.
#
# Starts from the hash returned by +super+ when an ancestor defines +to_solr+,
# otherwise from the +solr_doc+ argument. Besides the per-field assignments it
# derives start/end year fields from the first origin_info date key already
# present in +solr_doc+, stores the coordinates under "geo", and finally
# replaces the value of every key the class reports via +maps_field?+ with the
# result of +map_value+.
#
# @param solr_doc [Hash] document to extend (string keys)
# @return [Hash] the extended document; when +mods+ is nil nothing is added here
def to_solr(solr_doc={})
  solr_doc = (defined? super) ? super : solr_doc

  # There is no mods: return early, otherwise the lookups below would raise NoMethodError.
  return solr_doc if mods.nil?

  solr_doc["all_text_teim"] ||= []

  solr_doc["title_si"] = sort_title
  solr_doc["title_ssm"] = titles
  solr_doc["alternative_title_ssm"] = alternative_titles
  solr_doc["all_text_teim"] += solr_doc["alternative_title_ssm"]
  solr_doc["lib_collection_sim"] = collections
  solr_doc["lib_name_sim"] = names
  solr_doc["lib_name_teim"] = solr_doc["lib_name_sim"]
  solr_doc["all_text_teim"] += solr_doc["lib_name_teim"]
  solr_doc["lib_all_subjects_ssm"] = all_subjects
  solr_doc["lib_all_subjects_teim"] = solr_doc["lib_all_subjects_ssm"]
  solr_doc["all_text_teim"] += solr_doc["lib_all_subjects_teim"]
  solr_doc["lib_name_ssm"] = solr_doc["lib_name_sim"]
  solr_doc["lib_author_sim"] = names(:marcrelator, 'aut')
  solr_doc["lib_recipient_sim"] = names(:marcrelator, 'rcp')
  solr_doc["lib_format_sim"] = formats
  solr_doc["lib_shelf_sim"] = shelf_locators
  solr_doc["lib_date_textual_ssm"] = textual_dates
  solr_doc["lib_date_notes_ssm"] = date_notes
  solr_doc["lib_non_date_notes_ssm"] = non_date_notes
  solr_doc["lib_item_in_context_url_ssm"] = item_in_context_url
  solr_doc["lib_project_url_ssm"] = project_url
  solr_doc["origin_info_place_ssm"] = origin_info_place
  solr_doc["origin_info_place_for_display_ssm"] = origin_info_place_for_display

  repo_marc_code = repository_code
  unless repo_marc_code.nil?
    solr_doc["lib_repo_short_ssim"] = [translate_repo_marc_code(repo_marc_code, 'short')]
    solr_doc["lib_repo_long_sim"] = [translate_repo_marc_code(repo_marc_code, 'long')]
    solr_doc["lib_repo_full_ssim"] = [translate_repo_marc_code(repo_marc_code, 'full')]
  end
  solr_doc["lib_repo_text_ssm"] = repository_text

  project_titles = projects
  unless project_titles.nil?
    solr_doc["lib_project_short_ssim"] = []
    solr_doc["lib_project_full_ssim"] = []
    project_titles.each do |project_title|
      solr_doc["lib_project_short_ssim"] << translate_project_title(project_title, 'short')
      solr_doc["lib_project_full_ssim"] << translate_project_title(project_title, 'full')
    end
    solr_doc["lib_project_short_ssim"].uniq!
    solr_doc["lib_project_full_ssim"].uniq!
  end

  # Create convenient start and end date values based on one of the many possible originInfo/dateX elements.
  # The first key (in this order) that exists in solr_doc wins, even if its value list is empty.
  possible_start_date_fields = ['origin_info_date_issued_ssm', 'origin_info_date_issued_start_ssm', 'origin_info_date_created_ssm', 'origin_info_date_created_start_ssm', 'origin_info_date_other_ssm', 'origin_info_date_other_start_ssm']
  possible_end_date_fields = ['origin_info_date_issued_end_ssm', 'origin_info_date_created_end_ssm', 'origin_info_date_other_end_ssm']
  start_key = possible_start_date_fields.find { |key| solr_doc.has_key?(key) }
  end_key = possible_end_date_fields.find { |key| solr_doc.has_key?(key) }
  start_date = start_key && solr_doc[start_key][0]
  end_date = end_key && solr_doc[end_key][0]

  if start_date.present?
    # A missing end date means a single point in time: reuse the start date.
    end_date = start_date if end_date.blank?

    year_regex = /^(-?\d{1,4}).*/
    start_year_match = start_date.match(year_regex)
    end_year_match = end_date.match(year_regex)
    start_year = zero_pad_year(start_year_match.captures[0]) if start_year_match
    end_year = zero_pad_year(end_year_match.captures[0]) if end_year_match

    # An end date that does not begin with a year is treated like a missing end date.
    # Previously end_year stayed nil and the range concatenation below raised TypeError.
    end_year ||= start_year

    solr_doc["lib_start_date_year_itsi"] = start_year.to_i if start_year # TrieInt version for searches
    solr_doc["lib_end_date_year_itsi"] = end_year.to_i if end_year # TrieInt version for searches

    # Without a parsed start year there is no range to report; skipping also keeps
    # nil.to_i (0) from being presented as a real year in the textual fallback.
    if start_year
      solr_doc["lib_date_year_range_si"] = start_year + '-' + end_year

      # When no textual date is available, fall back to other date data (if available)
      if solr_doc["lib_date_textual_ssm"].blank?
        solr_doc["lib_date_textual_ssm"] = date_range_to_textual_date(start_year.to_i, end_year.to_i)
      end
    end
  end

  # Geo data
  solr_doc["geo"] = coordinates

  # Only values of existing keys are reassigned here, so the hash is safe to update while iterating.
  solr_doc.each do |k, v|
    if self.class.maps_field? k
      solr_doc[k] = self.class.map_value(k, v)
    end
  end

  solr_doc
end
423
+
424
# Left-pads a year with zeros to at least four digits.
#
# A leading minus sign is kept in front of the padded digits ("-5" => "-0005").
# Values that already have four or more digits are returned unchanged.
#
# @param year [#to_s] the year to pad
# @return [String] the padded year
def zero_pad_year(year)
  text = year.to_s
  sign = text.start_with?('-') ? '-' : ''
  # Pad only the digits, so the sign does not count towards the width of four.
  digits = sign.empty? ? text : text[1..-1]
  sign + digits.rjust(4, '0')
end
434
+ end
435
+ end