stanford-mods 1.3.3 → 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,6 @@
1
1
  # encoding: UTF-8
2
2
  require 'stanford-mods/searchworks_languages'
3
+ require 'stanford-mods/searchworks_subjects'
3
4
  require 'logger'
4
5
  require 'mods'
5
6
 
@@ -207,289 +208,11 @@ module Stanford
207
208
  # ---- end TITLE ----
208
209
 
209
210
  # ---- SUBJECT ----
210
-
211
- # Values are the contents of:
212
- # subject/geographic
213
- # subject/hierarchicalGeographic
214
- # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields)
215
- # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements
216
- # @return [Array<String>] values for geographic_search Solr field for this document or [] if none
217
- def sw_geographic_search(sep = ' ')
218
- result = term_values([:subject, :geographic]) || []
219
-
220
- # hierarchicalGeographic has sub elements
221
- @mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|
222
- hg_vals = []
223
- hg_node.element_children.each { |e|
224
- hg_vals << e.text unless e.text.empty?
225
- }
226
- result << hg_vals.join(sep) unless hg_vals.empty?
227
- }
228
-
229
- trans_code_vals = @mods_ng_xml.subject.geographicCode.translated_value
230
- if trans_code_vals
231
- trans_code_vals.each { |val|
232
- result << val if !result.include?(val)
233
- }
234
- end
235
-
236
- result
237
- end
238
-
239
- # Values are the contents of:
240
- # subject/name/namePart
241
- # "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")"
242
- # @param [String] sep - the separator string for joining namePart sub elements
243
- # @return [Array<String>] values for names inside subject elements or [] if none
244
- def sw_subject_names(sep = ', ')
245
- result = []
246
- @mods_ng_xml.subject.name_el.select { |n_el| n_el.namePart }.each { |name_el_w_np|
247
- parts = name_el_w_np.namePart.map { |npn| npn.text unless npn.text.empty? }.compact
248
- result << parts.join(sep).strip unless parts.empty?
249
- }
250
- result
251
- end
252
-
253
- # Values are the contents of:
254
- # subject/titleInfo/(subelements)
255
- # @param [String] sep - the separator string for joining titleInfo sub elements
256
- # @return [Array<String>] values for titles inside subject elements or [] if none
257
- def sw_subject_titles(sep = ' ')
258
- result = []
259
- @mods_ng_xml.subject.titleInfo.each { |ti_el|
260
- parts = ti_el.element_children.map { |el| el.text unless el.text.empty? }.compact
261
- result << parts.join(sep).strip unless parts.empty?
262
- }
263
- result
264
- end
265
-
266
- # Values are the contents of:
267
- # mods/genre
268
- # mods/subject/topic
269
- # @return [Array<String>] values for the topic_search Solr field for this document or nil if none
270
- def topic_search
271
- @topic_search ||= begin
272
- vals = self.term_values(:genre) || []
273
- vals.concat(subject_topics) if subject_topics
274
- vals.empty? ? nil : vals
275
- end
276
- end
277
-
278
- # Values are the contents of:
279
- # subject/topic
280
- # subject/name
281
- # subject/title
282
- # subject/occupation
283
- # with trailing comma, semicolon, and backslash (and any preceding spaces) removed
284
- # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none
285
- def topic_facet
286
- vals = subject_topics ? Array.new(subject_topics) : []
287
- vals.concat(subject_names) if subject_names
288
- vals.concat(subject_titles) if subject_titles
289
- vals.concat(subject_occupations) if subject_occupations
290
- vals.map! { |val|
291
- v = val.sub(/[\\,;]$/, '')
292
- v.strip
293
- }
294
- vals.empty? ? nil : vals
295
- end
296
-
297
- # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
298
- # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none
299
- def geographic_facet
300
- geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } unless !geographic_search
301
- end
302
-
303
- # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
304
- # @return [Array<String>] values for the era_facet Solr field for this document or nil if none
305
- def era_facet
306
- subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } unless !subject_temporal
307
- end
308
-
309
- # Values are the contents of:
310
- # subject/geographic
311
- # subject/hierarchicalGeographic
312
- # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields)
313
- # @return [Array<String>] values for the geographic_search Solr field for this document or nil if none
314
- def geographic_search
315
- @geographic_search ||= begin
316
- result = self.sw_geographic_search
317
-
318
- # TODO: this should go into stanford-mods ... but then we have to set that gem up with a Logger
319
- # print a message for any unrecognized encodings
320
- xvals = self.subject.geographicCode.translated_value
321
- codes = self.term_values([:subject, :geographicCode])
322
- if codes && codes.size > xvals.size
323
- self.subject.geographicCode.each { |n|
324
- if n.authority != 'marcgac' && n.authority != 'marccountry'
325
- sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}")
326
- end
327
- }
328
- end
329
-
330
- # FIXME: stanford-mods should be returning [], not nil ...
331
- return nil if !result || result.empty?
332
- result
333
- end
334
- end
335
-
336
- # Values are the contents of:
337
- # subject/name
338
- # subject/occupation - no subelements
339
- # subject/titleInfo
340
- # @return [Array<String>] values for the subject_other_search Solr field for this document or nil if none
341
- def subject_other_search
342
- @subject_other_search ||= begin
343
- vals = subject_occupations ? Array.new(subject_occupations) : []
344
- vals.concat(subject_names) if subject_names
345
- vals.concat(subject_titles) if subject_titles
346
- vals.empty? ? nil : vals
347
- end
348
- end
349
-
350
- # Values are the contents of:
351
- # subject/temporal
352
- # subject/genre
353
- # @return [Array<String>] values for the subject_other_subvy_search Solr field for this document or nil if none
354
- def subject_other_subvy_search
355
- @subject_other_subvy_search ||= begin
356
- vals = subject_temporal ? Array.new(subject_temporal) : []
357
- gvals = self.term_values([:subject, :genre])
358
- vals.concat(gvals) if gvals
359
-
360
- # print a message for any temporal encodings
361
- self.subject.temporal.each { |n|
362
- sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") if !n.encoding.empty?
363
- }
364
-
365
- vals.empty? ? nil : vals
366
- end
367
- end
368
-
369
- # Values are the contents of:
370
- # all subject subelements except subject/cartographic plus genre top level element
371
- # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none
372
- def subject_all_search
373
- vals = topic_search ? Array.new(topic_search) : []
374
- vals.concat(geographic_search) if geographic_search
375
- vals.concat(subject_other_search) if subject_other_search
376
- vals.concat(subject_other_subvy_search) if subject_other_subvy_search
377
- vals.empty? ? nil : vals
378
- end
379
-
211
+ # see searchworks_subjects.rb
380
212
  # ---- end SUBJECT ----
381
213
 
382
214
  # ---- PUBLICATION (place, year) ----
383
- def place
384
- vals = self.term_values([:origin_info,:place,:placeTerm])
385
- vals
386
- end
387
-
388
- # For the date display only, the first place to look is in the dates without encoding=marc array.
389
- # If no such dates, select the first date in the dates_marc_encoding array. Otherwise return nil
390
- # @return [String] value for the pub_date_display Solr field for this document or nil if none
391
- def pub_date_display
392
- return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
393
- return dates_marc_encoding.first unless dates_marc_encoding.empty?
394
- return nil
395
- end
396
-
397
- # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
398
- # If that doesn't exist, look in the dates without encoding=marc array. Otherwise return nil
399
- # @return [Array<String>] values for the date Solr field for this document or nil if none
400
- def pub_dates
401
- return dates_marc_encoding unless dates_marc_encoding.empty?
402
- return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
403
- return nil
404
- end
405
-
406
- def is_number?(object)
407
- true if Integer(object) rescue false
408
- end
409
- def is_date?(object)
410
- true if Date.parse(object) rescue false
411
- end
412
-
413
- # Get the publish year from mods
414
- # @return [String] 4 character year or nil if no valid date was found
415
- def pub_year
416
- #use the cached year if there is one
417
- if @pub_year
418
- if @pub_year == ''
419
- return nil
420
- end
421
- return @pub_year
422
- end
423
- dates = pub_dates
424
- if dates
425
- pruned_dates = []
426
- dates.each do |f_date|
427
- #remove ? and []
428
- if (f_date.length == 4 && f_date.end_with?('?'))
429
- pruned_dates << f_date.gsub('?','0')
430
- else
431
- pruned_dates << f_date.gsub('?','').gsub('[','').gsub(']','')
432
- end
433
- end
434
- #try to find a date starting with the most normal date formats and progressing to more wonky ones
435
- @pub_year = get_plain_four_digit_year pruned_dates
436
- return @pub_year if @pub_year
437
- # Check for years in u notation, e.g., 198u
438
- @pub_year = get_u_year pruned_dates
439
- return @pub_year if @pub_year
440
- @pub_year = get_double_digit_century pruned_dates
441
- return @pub_year if @pub_year
442
- @pub_year = get_bc_year pruned_dates
443
- return @pub_year if @pub_year
444
- @pub_year = get_three_digit_year pruned_dates
445
- return @pub_year if @pub_year
446
- @pub_year = get_single_digit_century pruned_dates
447
- return @pub_year if @pub_year
448
- end
449
- @pub_year=''
450
- return nil
451
- end
452
-
453
- #creates a date suitable for sorting. Guarnteed to be 4 digits or nil
454
- def pub_date_sort
455
- pd=nil
456
- if pub_date
457
- pd=pub_date
458
- if pd.length == 3
459
- pd='0'+pd
460
- end
461
- pd=pd.gsub('--','00')
462
- end
463
- raise "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd and pd.length !=4
464
- pd
465
- end
466
-
467
- #The year the object was published, , filtered based on max_pub_date and min_pub_date from the config file
468
- #@return [String] 4 character year or nil
469
- def pub_date
470
- pub_year || nil
471
- end
472
-
473
- #Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
474
- #@return <Array[String]> with values for the pub date facet
475
- def pub_date_facet
476
- if pub_date
477
- if pub_date.start_with?('-')
478
- return (pub_date.to_i + 1000).to_s + ' B.C.'
479
- end
480
- if pub_date.include? '--'
481
- cent=pub_date[0,2].to_i
482
- cent+=1
483
- cent=cent.to_s+'th century'
484
- return cent
485
- else
486
- return pub_date
487
- end
488
- else
489
- nil
490
- end
491
- end
492
-
215
+ # see origin_info.rb (as all this information comes from top level originInfo element)
493
216
  # ---- end PUBLICATION (place, year) ----
494
217
 
495
218
  def sw_logger
@@ -525,23 +248,23 @@ module Stanford
525
248
  when 'still image'
526
249
  val << 'Image'
527
250
  when 'text'
528
- val << 'Book' if issuance and issuance.include? 'monographic'
251
+ val << 'Book' if issuance && issuance.include?('monographic')
529
252
  book_genres = ['book chapter', 'Book chapter', 'Book Chapter',
530
253
  'issue brief', 'Issue brief', 'Issue Brief',
531
254
  'librettos', 'Librettos',
532
255
  'project report', 'Project report', 'Project Report',
533
256
  'technical report', 'Technical report', 'Technical Report',
534
257
  'working paper', 'Working paper', 'Working Paper']
535
- val << 'Book' if genres and !(genres & book_genres).empty?
258
+ val << 'Book' if genres && !(genres & book_genres).empty?
536
259
  conf_pub = ['conference publication', 'Conference publication', 'Conference Publication']
537
- val << 'Conference Proceedings' if genres and !(genres & conf_pub).empty?
538
- val << 'Journal/Periodical' if issuance and issuance.include? 'continuing'
260
+ val << 'Conference Proceedings' if genres && !(genres & conf_pub).empty?
261
+ val << 'Journal/Periodical' if issuance && issuance.include?('continuing')
539
262
  article = ['article', 'Article']
540
- val << 'Journal/Periodical' if genres and !(genres & article).empty?
263
+ val << 'Journal/Periodical' if genres && !(genres & article).empty?
541
264
  stu_proj_rpt = ['student project report', 'Student project report', 'Student Project report', 'Student Project Report']
542
- val << 'Other' if genres and !(genres & stu_proj_rpt).empty?
265
+ val << 'Other' if genres && !(genres & stu_proj_rpt).empty?
543
266
  thesis = ['thesis', 'Thesis']
544
- val << 'Thesis' if genres and !(genres & thesis).empty?
267
+ val << 'Thesis' if genres && !(genres & thesis).empty?
545
268
  when 'three dimensional object'
546
269
  val << 'Other'
547
270
  end
@@ -571,7 +294,7 @@ module Stanford
571
294
  ]
572
295
  if types
573
296
  genres = self.term_values(:genre)
574
- issuance = self.term_values([:origin_info,:issuance])
297
+ issuance = self.term_values([:origin_info, :issuance])
575
298
  types.each do |type|
576
299
  case type
577
300
  when 'cartographic'
@@ -583,7 +306,7 @@ module Stanford
583
306
  when 'notated music'
584
307
  val << 'Music score'
585
308
  when 'software, multimedia'
586
- if genres and (genres.include?('dataset') || genres.include?('Dataset'))
309
+ if genres && (genres.include?('dataset') || genres.include?('Dataset'))
587
310
  val << 'Dataset'
588
311
  else
589
312
  val << 'Software/Multimedia'
@@ -595,10 +318,10 @@ module Stanford
595
318
  when 'still image'
596
319
  val << 'Image'
597
320
  when 'text'
598
- val << 'Book' if genres and !(genres & article_genres).empty?
599
- val << 'Book' if issuance and issuance.include? 'monographic'
600
- val << 'Book' if genres and !(genres & book_genres).empty?
601
- val << 'Journal/Periodical' if issuance and issuance.include? 'continuing'
321
+ val << 'Book' if genres && !(genres & article_genres).empty?
322
+ val << 'Book' if issuance && issuance.include?('monographic')
323
+ val << 'Book' if genres && !(genres & book_genres).empty?
324
+ val << 'Journal/Periodical' if issuance && issuance.include?('continuing')
602
325
  when 'three dimensional object'
603
326
  val << 'Object'
604
327
  end
@@ -633,195 +356,21 @@ module Stanford
633
356
 
634
357
  # @return [String] value with the numeric catkey in it, or nil if none exists
635
358
  def catkey
636
- catkey=self.term_values([:record_info,:recordIdentifier])
637
- if catkey and catkey.length>0
638
- return catkey.first.gsub('a','') #need to ensure catkey is numeric only
359
+ catkey = self.term_values([:record_info, :recordIdentifier])
360
+ if catkey && catkey.length > 0
361
+ return catkey.first.tr('a', '') # ensure catkey is numeric only
639
362
  end
640
363
  nil
641
364
  end
642
- def druid= new_druid
643
- @druid=new_druid
644
- end
645
- def druid
646
- @druid ? @druid : 'Unknown item'
647
- end
648
-
649
- # protected ----------------------------------------------------------
650
-
651
- # convenience method for subject/name/namePart values (to avoid parsing the mods for the same thing multiple times)
652
- def subject_names
653
- @subject_names ||= self.sw_subject_names
654
- end
655
-
656
- # convenience method for subject/occupation values (to avoid parsing the mods for the same thing multiple times)
657
- def subject_occupations
658
- @subject_occupations ||= self.term_values([:subject, :occupation])
659
- end
660
-
661
- # convenience method for subject/temporal values (to avoid parsing the mods for the same thing multiple times)
662
- def subject_temporal
663
- @subject_temporal ||= self.term_values([:subject, :temporal])
664
- end
665
365
 
666
- # convenience method for subject/titleInfo values (to avoid parsing the mods for the same thing multiple times)
667
- def subject_titles
668
- @subject_titles ||= self.sw_subject_titles
366
+ def druid=(new_druid)
367
+ @druid = new_druid
669
368
  end
670
369
 
671
- # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times)
672
- def subject_topics
673
- @subject_topics ||= self.term_values([:subject, :topic])
674
- end
675
-
676
- #get a 4 digit year like 1865 from the date array
677
- def get_plain_four_digit_year dates
678
- dates.each do |f_date|
679
- matches=f_date.scan(/\d{4}/)
680
- if matches.length == 1
681
- @pub_year=matches.first
682
- return matches.first
683
- else
684
- #if there are multiples, check for ones with CE after them
685
- matches.each do |match|
686
- #look for things like '1865-6 CE'
687
- pos = f_date.index(Regexp.new(match+'...CE'))
688
- pos = pos ? pos.to_i : 0
689
- if f_date.include?(match+' CE') or pos > 0
690
- @pub_year=match
691
- return match
692
- end
693
- end
694
- return matches.first
695
- end
696
- end
697
- return nil
698
- end
699
-
700
- # If a year has a "u" in it, replace instances of u with 0
701
- # @param [String] dates
702
- # @return String
703
- def get_u_year dates
704
- dates.each do |f_date|
705
- # Single digit u notation
706
- matches = f_date.scan(/\d{3}u/)
707
- if matches.length == 1
708
- return matches.first.gsub('u','0')
709
- end
710
- # Double digit u notation
711
- matches = f_date.scan(/\d{2}u{2}/)
712
- if matches.length == 1
713
- return matches.first.gsub('u','-')
714
- end
715
- end
716
- return nil
717
- end
718
-
719
- #get a double digit century like '12th century' from the date array
720
- def get_double_digit_century dates
721
- dates.each do |f_date|
722
- matches=f_date.scan(/\d{2}th/)
723
- if matches.length == 1
724
- @pub_year=((matches.first[0,2].to_i)-1).to_s+'--'
725
- return @pub_year
726
- end
727
- #if there are multiples, check for ones with CE after them
728
- if matches.length > 0
729
- matches.each do |match|
730
- pos = f_date.index(Regexp.new(match+'...CE'))
731
- pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE'))
732
- pos = pos ? pos.to_i : 0
733
- if f_date.include?(match+' CE') or pos > 0
734
- @pub_year=((match[0,2].to_i) - 1).to_s+'--'
735
- return @pub_year
736
- end
737
- end
738
- end
739
- end
740
- return nil
741
- end
742
-
743
- #get a 3 digit year like 965 from the date array
744
- def get_three_digit_year dates
745
- dates.each do |f_date|
746
- matches=f_date.scan(/\d{3}/)
747
- if matches.length > 0
748
- return matches.first
749
- end
750
- end
751
- return nil
752
- end
753
- #get the 3 digit BC year, return it as a negative, so -700 for 300 BC. Other methods will translate it to proper display, this is good for sorting.
754
- def get_bc_year dates
755
- dates.each do |f_date|
756
- matches=f_date.scan(/\d{3} B.C./)
757
- if matches.length > 0
758
- bc_year=matches.first[0..2]
759
- return (bc_year.to_i-1000).to_s
760
- end
761
- end
762
- return nil
763
- end
764
-
765
- #get a single digit century like '9th century' from the date array
766
- def get_single_digit_century dates
767
- dates.each do |f_date|
768
- matches=f_date.scan(/\d{1}th/)
769
- if matches.length == 1
770
- @pub_year=((matches.first[0,2].to_i)-1).to_s+'--'
771
- return @pub_year
772
- end
773
- #if there are multiples, check for ones with CE after them
774
- if matches.length > 0
775
- matches.each do |match|
776
- pos = f_date.index(Regexp.new(match+'...CE'))
777
- pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE'))
778
- pos = pos ? pos.to_i : 0
779
- if f_date.include?(match+' CE') or pos > 0
780
- @pub_year=((match[0,1].to_i) - 1).to_s+'--'
781
- return @pub_year
782
- end
783
- end
784
- end
785
- end
786
- return nil
787
- end
788
-
789
- # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
790
- def dates_marc_encoding
791
- @dates_marc_encoding ||= begin
792
- parse_dates_from_originInfo
793
- @dates_marc_encoding
794
- end
795
- end
796
-
797
- # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding not "marc"
798
- def dates_no_marc_encoding
799
- @dates_no_marc_encoding ||= begin
800
- parse_dates_from_originInfo
801
- @dates_no_marc_encoding
802
- end
370
+ def druid
371
+ @druid ? @druid : 'Unknown item'
803
372
  end
804
373
 
805
- # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info
806
- # with and without encoding=marc
807
- def parse_dates_from_originInfo
808
- @dates_marc_encoding = []
809
- @dates_no_marc_encoding = []
810
- self.origin_info.dateIssued.each { |di|
811
- if di.encoding == "marc"
812
- @dates_marc_encoding << di.text
813
- else
814
- @dates_no_marc_encoding << di.text
815
- end
816
- }
817
- self.origin_info.dateCreated.each { |dc|
818
- if dc.encoding == "marc"
819
- @dates_marc_encoding << dc.text
820
- else
821
- @dates_no_marc_encoding << dc.text
822
- end
823
- }
824
- end
825
374
  end # class Record
826
375
  end # Module Mods
827
376
  end # Module Stanford