bpl_enrich 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Rakefile +32 -0
- data/lib/bpl_enrich/authorities.rb +79 -0
- data/lib/bpl_enrich/constants.rb +5 -0
- data/lib/bpl_enrich/dates.rb +251 -0
- data/lib/bpl_enrich/lcsh.rb +46 -0
- data/lib/bpl_enrich/version.rb +3 -0
- data/lib/bpl_enrich.rb +29 -0
- data/lib/tasks/bpl_enrich_tasks.rake +4 -0
- data/test/authorities_test.rb +48 -0
- data/test/bpl_enrich_test.rb +9 -0
- data/test/dates_test.rb +12 -0
- data/test/dummy/README.rdoc +28 -0
- data/test/dummy/Rakefile +6 -0
- data/test/dummy/app/assets/javascripts/application.js +13 -0
- data/test/dummy/app/assets/stylesheets/application.css +13 -0
- data/test/dummy/app/controllers/application_controller.rb +5 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/app/views/layouts/application.html.erb +14 -0
- data/test/dummy/bin/bundle +3 -0
- data/test/dummy/bin/rails +4 -0
- data/test/dummy/bin/rake +4 -0
- data/test/dummy/config/application.rb +23 -0
- data/test/dummy/config/boot.rb +5 -0
- data/test/dummy/config/database.yml +25 -0
- data/test/dummy/config/environment.rb +5 -0
- data/test/dummy/config/environments/development.rb +29 -0
- data/test/dummy/config/environments/production.rb +80 -0
- data/test/dummy/config/environments/test.rb +36 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
- data/test/dummy/config/initializers/inflections.rb +16 -0
- data/test/dummy/config/initializers/mime_types.rb +5 -0
- data/test/dummy/config/initializers/secret_token.rb +12 -0
- data/test/dummy/config/initializers/session_store.rb +3 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/test/dummy/config/locales/en.yml +23 -0
- data/test/dummy/config/routes.rb +56 -0
- data/test/dummy/config.ru +4 -0
- data/test/dummy/db/test.sqlite3 +0 -0
- data/test/dummy/log/development.log +35 -0
- data/test/dummy/log/test.log +180 -0
- data/test/dummy/public/404.html +58 -0
- data/test/dummy/public/422.html +58 -0
- data/test/dummy/public/500.html +57 -0
- data/test/dummy/public/favicon.ico +0 -0
- data/test/lcsh_test.rb +10 -0
- data/test/test_helper.rb +15 -0
- metadata +214 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 41994a22b0ef539c9938525b17ad475e6eaafce7
|
4
|
+
data.tar.gz: 9c16b1cbb137f1c770d72190b3916a2ac6f703ae
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 06560416487c4a58d16c0ebed6716429b44aa73bd92c62f6d99093ce4d130a416adf8fa8f5eaeae0e91f2f371fed67535288109265c1a1c8ea44c3690b15d78c
|
7
|
+
data.tar.gz: 54f601dd8de34b18f9c018c09ea48fe1ac4de300e719c07aa34b6f595cbab50c337eecf24d19cde70c2cde2c5f786af492346581027fc30872a1cb3de31db311
|
data/Rakefile
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Rake tasks for the gem: RDoc generation, Bundler's gem helper tasks, and the
# test suite (which is also the default task).
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

# `rake rdoc` writes documentation for README.rdoc and everything under lib/
# into the rdoc/ directory.
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'BplEnrich' # was 'Bplgeo', left over from another gem's Rakefile
  rdoc.options << '--line-numbers'
  rdoc.rdoc_files.include('README.rdoc')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Register the tasks provided by Bundler's gem helper.
Bundler::GemHelper.install_tasks

require 'rake/testtask'

# `rake test` runs every test/**/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = false
end

task default: :test
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module BplEnrich
  # Resolves free-text language, role and name values against Library of
  # Congress vocabularies, queried through Qa::Authorities::Loc.
  class Authorities

    # Looks a language up in the 'iso639-2' vocabulary.
    #
    # language_value - a label (e.g. 'English') or a code (e.g. 'eng'); both are
    #                  compared case-insensitively, the code against the last
    #                  path segment of each result id.
    #
    # Returns a Hash with :uri and :label for the first exact match, or an
    # empty Hash when nothing matches.
    def self.parse_language(language_value)
      wanted = language_value.downcase
      entry = first_loc_match(language_value, 'iso639-2') do |candidate|
        candidate['label'].downcase == wanted || candidate['id'].split('/').last.downcase == wanted
      end
      entry ? loc_term(entry) : {}
    end

    #TODO: Research why searching the 'relators' vocabulary for a code such as 'ctb' doesn't work.
    #
    # Looks a role label (e.g. 'Contributor') up in the 'relators' vocabulary.
    #
    # role_value - the role label; compared case-insensitively.
    #
    # Returns a Hash with :uri and :label for the first exact match, or an
    # empty Hash when nothing matches.
    def self.parse_role(role_value)
      wanted = role_value.downcase
      entry = first_loc_match(role_value, 'relators') { |candidate| candidate['label'].downcase == wanted }
      entry ? loc_term(entry) : {}
    end

    # Splits a trailing relator term off a name string, e.g.
    # 'Steven Anderson (Contributor)' => name 'Steven Anderson', label 'Contributor'.
    #
    # name - the raw name string.
    #
    # Returns a Hash that always has :name (the original string when no role was
    # recognised) and, when the last word is a 'relators' label, :uri and :label.
    def self.parse_name_for_role(name)
      return_hash = {:name=>name}
      ascii_name = name.to_ascii

      #Make sure we have at least three distinct parts of 2-letter+ words. Avoid something like: Steven C. Painter or Painter, Steven C.
      #Possible Issue: Full name of Steven Carlos Painter ?
      potential_role_check = ascii_name.match(/[\(\"\',]*\w\w+[\),\"\']* [\w\.,\d\-\"]*[\w\d][\w\d][\w\.,\d\-\"]* [\(\"\',]*\w\w+[\),\"\']*$/) || name.split(/[ ]+/).length >= 4
      return return_hash unless potential_role_check.present?

      #Check the last value of the name string...
      role_value = ascii_name.match(/(?<=[\(\"\', ])\w+(?=[\),\"\']*$)/).to_s
      wanted = role_value.downcase
      entry = first_loc_match(role_value, 'relators') { |candidate| candidate['label'].downcase == wanted }

      if entry
        #Remove the word and any other characters around it. $ means the end of the line.
        return_hash[:name] = name.sub(/[\(\"\', ]*\w+[\),\"\']*$/, '').gsub(/^[ ]*:/, '').strip
        return_hash.merge!(loc_term(entry))
      end

      # NOTE(review): the original code also searched for the FIRST word of the
      # name (intended for inputs shaped like 'Photographer: Some Name'), but
      # guarded the result with `return_hash.blank?`, which is never true because
      # :name is always set. The lookup therefore only cost a remote request and
      # never changed the result, so it has been removed. Re-enabling it needs a
      # decision first: inverted names whose surname is also a relator label
      # ('Singer, ...', 'Judge, ...') would be misparsed.

      return_hash
    end

    # Searches a LOC vocabulary and returns the first result for which the block
    # is truthy, or nil when the search is empty or nothing qualifies.
    def self.first_loc_match(term, vocabulary)
      # URI.escape was removed in Ruby 3.0; DEFAULT_PARSER.escape is what it delegated to.
      results = Qa::Authorities::Loc.new.search(URI::DEFAULT_PARSER.escape(term), vocabulary)
      return nil unless results.present?

      results.find { |candidate| yield candidate }
    end
    private_class_method :first_loc_match

    # Builds the :uri/:label Hash for a LOC result, rewriting the 'info:lc' id
    # prefix to 'http://id.loc.gov'.
    def self.loc_term(entry)
      {
        :uri => entry["id"].gsub('info:lc', 'http://id.loc.gov'),
        :label => entry["label"]
      }
    end
    private_class_method :loc_term
  end
end
|
@@ -0,0 +1,251 @@
|
|
1
|
+
module BplEnrich
  class Dates

    # a function to convert date data from OAI feeds into MODS-usable date data
    # assumes date values containing ";" have already been split
    #
    # value - the raw date String.
    #
    # Returns a Hash with any of:
    #   :single_date    - 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'
    #   :date_range     - Hash with :start and :end
    #   :date_qualifier - 'questionable', 'approximate' or 'inferred'
    #   :date_note      - the original (stripped) string, set whenever the value
    #                     could not be fully expressed as a date or range
    def self.standardize(value)

      date_data = {} # create the hash to hold all the data
      source_date_string = value.strip # variable to hold original value

      # declared up front so the assignments made inside blocks below are visible afterwards
      date_data_range_start = nil
      date_data_range_end = nil

      # weed out obvious bad dates before processing
      if (value.match(/([Pp]re|[Pp]ost|[Bb]efore|[Aa]fter|[Uu]nknown|[Uu]ndated|n\.d\.)/)) ||
          (value.match(/\d\d\d\d-\z/)) || # 1975-
          (value.match(/\d\d-\d\d\/\d\d/)) || # 1975-09-09/10
          (value.match(/\d*\(\d*\)/)) || # 1975(1976)
          (value.scan(/\d\d\d\d/).length > 2) || # 1861/1869/1915
          (value.scan(/([Ee]arly|[Ll]ate|[Mm]id|[Ww]inter|[Ss]pring|[Ss]ummer|[Ff]all)/).length > 1) ||
          # or if data does not match any of these
          (!value.match(/(\d\dth [Cc]entury|\d\d\d-\?*|\d\d\d\?|\d\d\?\?|\d\d\d\d)/))
        date_data[:date_note] = source_date_string
      else
        # find date qualifier
        if value.include? '?'
          date_data[:date_qualifier] = 'questionable'
        elsif value.match(/\A[Cc]/)
          date_data[:date_qualifier] = 'approximate'
        elsif (value.match(/[\[\]]+/)) || (value.match(/[(][A-Za-z, \d]*[\d]+[A-Za-z, \d]*[)]+/)) # if [] or ()
          date_data[:date_qualifier] = 'inferred'
        end

        # remove unnecessary chars and words
        value = value.gsub(/[\[\]\(\)\.,']/,'')
        value = value.gsub(/(\b[Bb]etween\b|\bcirca\b|\bca\b|\Aca|\Ac)/,'').strip

        # differentiate between ranges and single dates
        if (value.scan(/\d\d\d\d/).length == 2) ||
            (value.include? '0s') || # 1970s
            (value.include? 'entury') || # 20th century
            (value.match(/(\A\d\d\d\?|\A\d\d\?\?|\A\d\d\d-\?*|\d\d\d\d-\d\z|\d\d\d\d\/[\d]{1,2}\z)/)) ||
            (value.match(/([Ee]arly|[Ll]ate|[Mm]id|[Ww]inter|[Ss]pring|[Ss]ummer|[Ff]all)/)) ||
            ((value.match(/\d\d\d\d-\d\d\z/)) && (value[-2..-1].to_i > 12)) # 1975-76 but NOT 1910-11

          # RANGES
          date_data[:date_range] = {}

          # deal with date strings with 2 4-digit year values separately
          if value.scan(/\d\d\d\d/).length == 2

            # convert weird span indicators ('or','and','||'), remove extraneous text
            value = value.gsub(/(or|and|\|\|)/,'-').gsub(/[A-Za-z\?\s]/,'')

            if value.match(/\A[12][\d]{3}-[01][\d]-[12][\d]{3}-[01][\d]\z/) # 1895-05-1898-01
              date_data_range_start = value[0..6]
              date_data_range_end = value[-7..-1]
            elsif value.match(/\A[12][\d]{3}\/[12][\d]{3}\z/) # 1987/1988
              date_data_range_start = value[0..3]
              date_data_range_end = value[-4..-1]
            else
              range_dates = value.split('-') # split the dates into an array
              range_dates.each_with_index do |range_date,index|
                # format the data properly
                if range_date.include? '/' # 11/05/1965
                  range_date_pieces = range_date.split('/')
                  range_date_piece_year = range_date_pieces.last
                  range_date_piece_month = range_date_pieces.first.length == 2 ? range_date_pieces.first : '0' + range_date_pieces.first
                  if range_date_pieces.length == 3
                    range_date_piece_day = range_date_pieces[1].length == 2 ? range_date_pieces[1] : '0' + range_date_pieces[1]
                  end
                  value_to_insert = range_date_piece_year + '-' + range_date_piece_month
                  value_to_insert << '-' + range_date_piece_day if range_date_piece_day
                elsif range_date.match(/\A[12][\d]{3}\z/)
                  value_to_insert = range_date
                end
                # add the data to the proper variable
                if value_to_insert
                  if index == 0
                    date_data_range_start = value_to_insert
                  else
                    date_data_range_end = value_to_insert
                  end
                end
              end
            end
          else
            # if there are 'natural language' range values, find, assign to var, then remove
            text_range = value.match(/([Ee]arly|[Ll]ate|[Mm]id|[Ww]inter|[Ss]pring|[Ss]ummer|[Ff]all)/).to_s
            if text_range.length > 0
              date_data[:date_qualifier] ||= 'approximate' # TODO - remove this??
              value = value.gsub(text_range,'').strip
            end

            # deal with ranges for which 'natural language' range values are ignored
            if value.match(/\A1\d\?\?\z/) # 19??
              date_data_range_start = value[0..1] + '00'
              date_data_range_end = value[0..1] + '99'
            elsif value.match(/\A[12]\d\d-*\?*\z/) # 195? || 195-? || 195-
              date_data_range_start = value[0..2] + '0'
              date_data_range_end = value[0..2] + '9'
            elsif value.match(/\A[12]\d\d\d[-\/][\d]{1,2}\z/) # 1956-57 || 1956/57 || 1956-7
              if value.length == 7 && (value[5..6].to_i > value[2..3].to_i)
                date_data_range_start = value[0..3]
                date_data_range_end = value[0..1] + value[5..6]
              elsif value.length == 6 && (value[5].to_i > value[3].to_i)
                date_data_range_start = value[0..3]
                date_data_range_end = value[0..2] + value[5]
              end
              date_data[:date_note] = source_date_string if text_range.length > 0
            end
            # deal with ranges where text range values are evaluated
            value = value.gsub(/\?/,'').strip # remove question marks

            # centuries
            if value.match(/([12][\d]{1}th [Cc]entury|[12][\d]{1}00s)/) # 19th century || 1800s
              if value.match(/[12][\d]{1}00s/)
                century_prefix_date = value.match(/[12][\d]{1}/).to_s
              else
                century_prefix_date = (value.match(/[12][\d]{1}/).to_s.to_i-1).to_s
              end
              if text_range.match(/([Ee]arly|[Ll]ate|[Mm]id)/)
                if text_range.match(/[Ee]arly/)
                  century_suffix_dates = %w[00 39]
                elsif text_range.match(/[Mm]id/)
                  century_suffix_dates = %w[30 69]
                else
                  century_suffix_dates = %w[60 99]
                end
              end
              date_data_range_start = century_suffix_dates ? century_prefix_date + century_suffix_dates[0] : century_prefix_date + '00'
              date_data_range_end = century_suffix_dates ? century_prefix_date + century_suffix_dates[1] : century_prefix_date + '99'
            else
              # remove any remaining non-date text, but preserve decade-ness.
              # The second digit may be 0 so that 2010s, 2020s ... count as decades;
              # x00s never reaches here because it is handled as a century above.
              is_decade = !value.match(/[12]\d[1-9]0s/).nil?
              remaining_text = value.match(/\D+/).to_s
              # plain-string replacement: the leftover text is arbitrary input and
              # must not be interpreted as a regular expression
              value = value.gsub(remaining_text,'').strip if remaining_text.length > 0

              # decades
              if is_decade
                decade_prefix_date = value.match(/\A[12]\d[1-9]/).to_s
                if text_range.match(/([Ee]arly|[Ll]ate|[Mm]id)/)
                  if text_range.match(/[Ee]arly/)
                    decade_suffix_dates = %w[0 3]
                  elsif text_range.match(/[Mm]id/)
                    decade_suffix_dates = %w[4 6]
                  else
                    decade_suffix_dates = %w[7 9]
                  end
                end
                date_data_range_start = decade_suffix_dates ? decade_prefix_date + decade_suffix_dates[0] : decade_prefix_date + '0'
                date_data_range_end = decade_suffix_dates ? decade_prefix_date + decade_suffix_dates[1] : decade_prefix_date + '9'
              else
                # single year ranges
                single_year_prefix = value.match(/[12][0-9]{3}/).to_s
                # only when a full year is present; otherwise (e.g. 'early 195?')
                # keep the range computed above instead of emitting '-01'
                if text_range.length > 0 && !single_year_prefix.empty?
                  if text_range.match(/[Ee]arly/)
                    single_year_suffixes = %w[01 04]
                  elsif text_range.match(/[Mm]id/)
                    single_year_suffixes = %w[05 08]
                  elsif text_range.match(/[Ll]ate/)
                    single_year_suffixes = %w[09 12]
                  elsif text_range.match(/[Ww]inter/)
                    single_year_suffixes = %w[01 03]
                  elsif text_range.match(/[Ss]pring/)
                    single_year_suffixes = %w[03 05]
                  elsif text_range.match(/[Ss]ummer/)
                    single_year_suffixes = %w[06 08]
                  else # the only remaining possibility is [Ff]all
                    single_year_suffixes = %w[09 11]
                  end
                  date_data_range_start = single_year_prefix + '-' + single_year_suffixes[0]
                  date_data_range_end = single_year_prefix + '-' + single_year_suffixes[1]
                end
              end
              # if possibly significant info removed, include as note
              date_data[:date_note] = source_date_string if remaining_text.length > 1
            end
          end

          # insert the values into the date_data hash
          if date_data_range_start && date_data_range_end
            date_data[:date_range][:start] = date_data_range_start
            date_data[:date_range][:end] = date_data_range_end
          else
            date_data[:date_note] ||= source_date_string
            date_data.delete :date_range
          end

        else
          # SINGLE DATES
          value = value.gsub(/\?/,'') # remove question marks
          # fix bad spacing (e.g. December 13,1985 || December 3,1985)
          value = value.insert(-5, ' ') if value.match(/[A-Za-z]* \d{6}/) || value.match(/[A-Za-z]* \d{5}/)

          # try to automatically parse single dates with YYYY && MM && DD values
          parsed_date = Timeliness.parse(value)
          if parsed_date.nil?
            # start further processing
            if value.match(/\A[12]\d\d\d-[01][0-9]\z/) # yyyy-mm
              date_data[:single_date] = value
            elsif value.match(/\A(0?[1-9]|1[0-2])[-\/][12]\d\d\d\z/) # mm-yyyy || m-yyyy || mm/yyyy
              value = '0' + value if value.match(/\A[1-9][-\/][12]\d\d\d\z/) # m-yyyy || m/yyyy
              date_data[:single_date] = value[3..6] + '-' + value[0..1]
            elsif value.match(/\A[A-Za-z]{3,} [12]\d\d\d\z/) # April 1987 || Apr. 1987
              month_name, year = value.split(' ')
              month_names = month_name.length == 3 ? Date::ABBR_MONTHNAMES : Date::MONTHNAMES
              month_index = month_names.index(month_name)
              # an unrecognised word leaves just the year instead of raising
              date_data[:single_date] = month_index ? year + '-' + ('%02d' % month_index) : year
            elsif value.match(/\A[12]\d\d\d\z/) # 1999
              date_data[:single_date] = value
            else
              date_data[:date_note] = source_date_string
            end
          else
            date_data[:single_date] = parsed_date.strftime("%Y-%m-%d")
          end

        end

      end

      # some final validation, just in case
      date_validation_array = []
      date_validation_array << date_data[:single_date] if date_data[:single_date]
      date_validation_array << date_data[:date_range][:start] if date_data[:date_range]
      date_validation_array << date_data[:date_range][:end] if date_data[:date_range]
      date_validation_array.each do |date_to_val|
        bad_date =
          case date_to_val.length
          when 7 then !date_to_val[-2..-1].to_i.between?(1,12) # YYYY-MM: month must be 01..12
          when 10 then Timeliness.parse(date_to_val).nil? # YYYY-MM-DD: must be a real date
          else false
          end
        if bad_date
          date_data[:date_note] ||= source_date_string
          date_data.delete :single_date if date_data[:single_date]
          date_data.delete :date_range if date_data[:date_range]
        end
      end

      # if the date slipped by all the processing somehow!
      if date_data[:single_date].nil? && date_data[:date_range].nil? && date_data[:date_note].nil?
        date_data[:date_note] = source_date_string
      end

      date_data

    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module BplEnrich
  class LCSH

    #Take LCSH subjects and make them standard.
    #
    # value - the raw subject heading String.
    #
    # Returns '' for blank input (or input that is only a quotation mark);
    # otherwise the heading with wrapping quotes removed, a trailing period
    # dropped (unless it follows a capital letter or ends 'etc.'), every dash
    # variant normalised to '--' without surrounding whitespace, the first
    # character capitalized, and finally passed through BplEnrich.strip_value.
    def self.standardize(value)
      return '' if value.blank?

      #Remove stuff that is quoted (quotation for first and last words)..
      value = value.gsub(/^['"]/, '').gsub(/['"]$/, '').strip
      # nothing left to standardize (input was just a quote character)
      return '' if value.empty?

      #Remove ending periods ... except when an initial or etc.
      # The length guard keeps value[-2] from being nil for a lone '.'.
      if value.length > 1 && value.end_with?('.') && value[-2].match(/[^A-Z]/) && !value.end_with?('etc.')
        value = value.slice(0..-2)
      end

      #Fix when '- -' occurs
      value = value.gsub(/-\s-/,'--')

      #Fix for "em" dashes
      value = value.gsub('—','--')

      #Fix for "en" dashes
      value = value.gsub('–','--')

      #Fix for ' - ' combinations
      value = value.gsub(' - ','--')

      #Remove white space after and before '--'
      value = value.gsub(/\s+--/,'--')
      value = value.gsub(/--\s+/,'--')

      #Ensure first word is capitalized
      value[0] = value[0].capitalize[0]

      #Strip any white space
      BplEnrich.strip_value(value)
    end

  end
end
|
data/lib/bpl_enrich.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Entry point of the gem: loads the enrichment classes and their dependencies
# and provides the shared string clean-up helpers.
module BplEnrich
  require "bpl_enrich/lcsh"
  require "bpl_enrich/dates"
  require "bpl_enrich/constants"
  require "bpl_enrich/authorities"
  require "timeliness"
  require "unidecoder"
  require "htmlentities"
  require "qa"

  # Cleans a single metadata value.
  #
  # value - any object; Float and Integer values are truncated to an integer
  #         and stringified first (so 1975.0 becomes "1975").
  #
  # Returns nil when the value is blank, otherwise the result of utf8Encode.
  def self.strip_value(value)
    return nil if value.blank?

    # Integer replaces the original Fixnum check: Fixnum no longer exists on
    # Ruby 3.2+, and Integer is its superclass on every older version.
    value = value.to_i.to_s if value.is_a?(Float) || value.is_a?(Integer)

    # Remove HTML tags, character entities and any carriage returns
    utf8Encode(value)
  end

  #TODO: Better name for this. Should be part of an overall helper gem.
  #
  # Replaces tabs (with any directly preceding line break), line breaks and
  # <br/> tags with a space, strips the remaining markup with ActionView's
  # full sanitizer, decodes HTML entities and trims surrounding whitespace.
  #
  # value - any object; converted with to_s.
  #
  # Returns the cleaned String.
  def self.utf8Encode(value)
    flattened = value.to_s.gsub(/\r?\n?\t/, ' ').gsub(/\r?\n/, ' ').gsub(/<br[\s]*\/>/,' ')
    ::HTMLEntities.new.decode(ActionView::Base.full_sanitizer.sanitize(flattened)).strip
  end

end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises BplEnrich::Authorities; the expectations below are Library of
# Congress vocabulary URIs and labels.
class AuthoritiesTest < ActiveSupport::TestCase
  # A language resolves from either its code or its label.
  def test_parse_language
    result = BplEnrich::Authorities.parse_language('eng')
    assert_equal 'English', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/iso639-2/eng', result[:uri]

    result = BplEnrich::Authorities.parse_language('English')
    assert_equal 'English', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/iso639-2/eng', result[:uri]
  end

  # A role resolves from its label.
  def test_parse_role
    result = BplEnrich::Authorities.parse_role('Contributor')
    assert_equal 'Contributor', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/relators/ctb', result[:uri]

    #FIXME: Using the code doesn't seem to work in this vocab?
    #result = BplEnrich::Authorities.parse_role('ctb')
    #assert_equal 'Contributor', result[:label]
    #assert_equal 'http://id.loc.gov/vocabulary/relators/ctb', result[:uri]
  end

  # A trailing relator term is split off the name; anything else is left alone.
  def test_parse_name_for_role
    result = BplEnrich::Authorities.parse_name_for_role('Steven Anderson (Contributor)')
    assert_equal 'Steven Anderson', result[:name]
    assert_equal 'Contributor', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/relators/ctb', result[:uri]

    # 'Painter' is expected not to resolve, so the name stays untouched.
    # assert_nil replaces `assert_equal nil, ...`, which Minitest 5 deprecates.
    result = BplEnrich::Authorities.parse_name_for_role('Steven Anderson (Painter)')
    assert_equal 'Steven Anderson (Painter)', result[:name]
    assert_nil result[:label]
    assert_nil result[:uri]

    #Special non-Ascii character check
    result = BplEnrich::Authorities.parse_name_for_role('Sully, François (Photographer)')
    assert_equal 'Sully, François', result[:name]
    assert_equal 'Photographer', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/relators/pht', result[:uri]
  end
end
|
data/test/dates_test.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises BplEnrich::Dates.standardize.
class DatesTest < ActiveSupport::TestCase
  # A "Month YYYY" string becomes a single 'YYYY-MM' date with no range or note.
  def test_date_standardizer
    result = BplEnrich::Dates.standardize('April 1983')
    assert_equal '1983-04', result[:single_date]
    # assert_nil replaces `assert_equal nil, ...`, which Minitest 5 deprecates.
    assert_nil result[:date_range]
    assert_nil result[:date_note]
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
== README
|
2
|
+
|
3
|
+
This README would normally document whatever steps are necessary to get the
|
4
|
+
application up and running.
|
5
|
+
|
6
|
+
Things you may want to cover:
|
7
|
+
|
8
|
+
* Ruby version
|
9
|
+
|
10
|
+
* System dependencies
|
11
|
+
|
12
|
+
* Configuration
|
13
|
+
|
14
|
+
* Database creation
|
15
|
+
|
16
|
+
* Database initialization
|
17
|
+
|
18
|
+
* How to run the test suite
|
19
|
+
|
20
|
+
* Services (job queues, cache servers, search engines, etc.)
|
21
|
+
|
22
|
+
* Deployment instructions
|
23
|
+
|
24
|
+
* ...
|
25
|
+
|
26
|
+
|
27
|
+
Please feel free to use a different markup language if you do not plan to run
|
28
|
+
<tt>rake doc:app</tt>.
|
data/test/dummy/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
// This is a manifest file that'll be compiled into application.js, which will include all the files
|
2
|
+
// listed below.
|
3
|
+
//
|
4
|
+
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
|
5
|
+
// or vendor/assets/javascripts of plugins, if any, can be referenced here using a relative path.
|
6
|
+
//
|
7
|
+
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
|
8
|
+
// compiled file.
|
9
|
+
//
|
10
|
+
// Read Sprockets README (https://github.com/sstephenson/sprockets#sprockets-directives) for details
|
11
|
+
// about supported directives.
|
12
|
+
//
|
13
|
+
//= require_tree .
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
3
|
+
* listed below.
|
4
|
+
*
|
5
|
+
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
6
|
+
* or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path.
|
7
|
+
*
|
8
|
+
* You're free to add application-wide styles to this file and they'll appear at the top of the
|
9
|
+
* compiled file, but it's generally better to create a new file per style scope.
|
10
|
+
*
|
11
|
+
*= require_self
|
12
|
+
*= require_tree .
|
13
|
+
*/
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Dummy</title>
|
5
|
+
<%= stylesheet_link_tag "application", media: "all", "data-turbolinks-track" => true %>
|
6
|
+
<%= javascript_include_tag "application", "data-turbolinks-track" => true %>
|
7
|
+
<%= csrf_meta_tags %>
|
8
|
+
</head>
|
9
|
+
<body>
|
10
|
+
|
11
|
+
<%= yield %>
|
12
|
+
|
13
|
+
</body>
|
14
|
+
</html>
|
data/test/dummy/bin/rake
ADDED