bpl_enrich 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Rakefile +32 -0
- data/lib/bpl_enrich/authorities.rb +79 -0
- data/lib/bpl_enrich/constants.rb +5 -0
- data/lib/bpl_enrich/dates.rb +251 -0
- data/lib/bpl_enrich/lcsh.rb +46 -0
- data/lib/bpl_enrich/version.rb +3 -0
- data/lib/bpl_enrich.rb +29 -0
- data/lib/tasks/bpl_enrich_tasks.rake +4 -0
- data/test/authorities_test.rb +48 -0
- data/test/bpl_enrich_test.rb +9 -0
- data/test/dates_test.rb +12 -0
- data/test/dummy/README.rdoc +28 -0
- data/test/dummy/Rakefile +6 -0
- data/test/dummy/app/assets/javascripts/application.js +13 -0
- data/test/dummy/app/assets/stylesheets/application.css +13 -0
- data/test/dummy/app/controllers/application_controller.rb +5 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/app/views/layouts/application.html.erb +14 -0
- data/test/dummy/bin/bundle +3 -0
- data/test/dummy/bin/rails +4 -0
- data/test/dummy/bin/rake +4 -0
- data/test/dummy/config/application.rb +23 -0
- data/test/dummy/config/boot.rb +5 -0
- data/test/dummy/config/database.yml +25 -0
- data/test/dummy/config/environment.rb +5 -0
- data/test/dummy/config/environments/development.rb +29 -0
- data/test/dummy/config/environments/production.rb +80 -0
- data/test/dummy/config/environments/test.rb +36 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
- data/test/dummy/config/initializers/inflections.rb +16 -0
- data/test/dummy/config/initializers/mime_types.rb +5 -0
- data/test/dummy/config/initializers/secret_token.rb +12 -0
- data/test/dummy/config/initializers/session_store.rb +3 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/test/dummy/config/locales/en.yml +23 -0
- data/test/dummy/config/routes.rb +56 -0
- data/test/dummy/config.ru +4 -0
- data/test/dummy/db/test.sqlite3 +0 -0
- data/test/dummy/log/development.log +35 -0
- data/test/dummy/log/test.log +180 -0
- data/test/dummy/public/404.html +58 -0
- data/test/dummy/public/422.html +58 -0
- data/test/dummy/public/500.html +57 -0
- data/test/dummy/public/favicon.ico +0 -0
- data/test/lcsh_test.rb +10 -0
- data/test/test_helper.rb +15 -0
- metadata +214 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 41994a22b0ef539c9938525b17ad475e6eaafce7
|
4
|
+
data.tar.gz: 9c16b1cbb137f1c770d72190b3916a2ac6f703ae
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 06560416487c4a58d16c0ebed6716429b44aa73bd92c62f6d99093ce4d130a416adf8fa8f5eaeae0e91f2f371fed67535288109265c1a1c8ea44c3690b15d78c
|
7
|
+
data.tar.gz: 54f601dd8de34b18f9c018c09ea48fe1ac4de300e719c07aa34b6f595cbab50c337eecf24d19cde70c2cde2c5f786af492346581027fc30872a1cb3de31db311
|
data/Rakefile
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Rake tasks for the bpl_enrich gem: RDoc generation, Bundler's gem helper
# tasks and the test suite (which is also the default task).
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

# `rake rdoc` documents README.rdoc and everything under lib/ into ./rdoc.
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  # Title names this gem (it previously carried the name of another gem).
  rdoc.title    = 'BplEnrich'
  rdoc.options << '--line-numbers'
  rdoc.rdoc_files.include('README.rdoc')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

Bundler::GemHelper.install_tasks

require 'rake/testtask'

# `rake test` runs every test/**/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = false
end

task default: :test
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module BplEnrich
  # Resolves free-text language and role values against the Library of
  # Congress vocabularies searched through Qa::Authorities::Loc.
  #
  # Every public method returns a Hash. :uri and :label are only set when the
  # search returned an entry that equals the requested value (ignoring case).
  class Authorities

    # Looks up a language in the 'iso639-2' vocabulary.
    #
    # @param language_value [String] a label ("English") or a code ("eng")
    # @return [Hash] {:uri, :label} when an entry's label, or the last path
    #   segment of its id, equals language_value; otherwise {}
    def self.parse_language(language_value)
      wanted = language_value.to_s.downcase
      entry = find_authority(language_value, 'iso639-2') do |candidate|
        candidate['label'].downcase == wanted || candidate['id'].split('/').last.downcase == wanted
      end
      entry ? authority_fields(entry) : {}
    end

    #TODO: Research why searching the 'relators' vocabulary by code (e.g. 'ctb') doesn't work.
    #
    # Looks up a role by its label in the 'relators' vocabulary.
    #
    # @param role_value [String] role label, e.g. "Contributor"
    # @return [Hash] {:uri, :label} on an exact label match, otherwise {}
    def self.parse_role(role_value)
      entry = find_role(role_value)
      entry ? authority_fields(entry) : {}
    end

    # Splits a role word off a name string such as "Steven Anderson (Contributor)".
    #
    # The last word is tried first. If it is not a relator label, the first
    # word is tried. (The first-word check used to be guarded by
    # `return_hash.blank?`, which could never be true because the hash always
    # holds :name, so it never ran.)
    #
    # @param name [String] name that may carry a leading or trailing role word
    # @return [Hash] always has :name; has :uri and :label too when a role was
    #   recognised, in which case :name has the role word removed
    def self.parse_name_for_role(name)
      return_hash = {:name=>name}
      ascii_name = name.to_ascii

      #Make sure we have at least three distinct parts of 2-letter+ words. Avoid something like: Steven C. Painter or Painter, Steven C.
      #Possible Issue: Full name of Steven Carlos Painter ?
      potential_role = ascii_name.match(/[\(\"\',]*\w\w+[\),\"\']* [\w\.,\d\-\"]*[\w\d][\w\d][\w\.,\d\-\"]* [\(\"\',]*\w\w+[\),\"\']*$/) || name.split(/[ ]+/).length >= 4
      return return_hash unless potential_role

      #Check the last word of the name string...
      role_value = ascii_name.match(/(?<=[\(\"\', ])\w+(?=[\),\"\']*$)/).to_s
      entry = find_role(role_value)
      if entry
        #Remove the word and any other characters around it. $ means the end of the line.
        remainder = name.sub(/[\(\"\', ]*\w+[\),\"\']*$/, '')
      else
        #... and only when that found nothing, check the first word.
        role_value = ascii_name.match(/\w+(?=[\),\"\']*)/).to_s
        entry = find_role(role_value)
        remainder = name.sub(/[\(\"\', ]*\w+[ \),\"\']*/, '') if entry
      end

      if entry
        return_hash[:name] = remainder.gsub(/^[ ]*:/, '').strip
        return_hash.merge!(authority_fields(entry))
      end

      return_hash
    end

    # Finds the 'relators' entry whose label equals role_value (ignoring case), or nil.
    def self.find_role(role_value)
      wanted = role_value.to_s.downcase
      find_authority(role_value, 'relators') { |candidate| candidate['label'].downcase == wanted }
    end

    # Searches one vocabulary and returns the first result for which the
    # block is truthy, or nil. Empty terms are not sent to the service.
    def self.find_authority(term, vocabulary)
      return nil if term.to_s.empty?

      results = Qa::Authorities::Loc.new.search(escape_term(term), vocabulary)
      Array(results).find { |candidate| yield(candidate) }
    end

    # Percent-encodes a search term the way URI.escape did. URI.escape was
    # removed in Ruby 3.0, so the parser object is called directly.
    def self.escape_term(term)
      parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
      parser.escape(term)
    end

    # Builds the {:uri, :label} pair, rewriting the 'info:lc' id prefix to http://id.loc.gov.
    def self.authority_fields(entry)
      { :uri => entry['id'].gsub('info:lc', 'http://id.loc.gov'), :label => entry['label'] }
    end

    private_class_method :find_role, :find_authority, :escape_term, :authority_fields
  end
end
|
@@ -0,0 +1,251 @@
|
|
1
|
+
module BplEnrich
  class Dates

    # Converts free-text date data (e.g. from OAI feeds) into MODS-usable date data.
    # Assumes date values containing ";" have already been split.
    #
    # @param value [String] one raw date expression, e.g. "ca. 1920", "1975-76", "early 1970s"
    # @return [Hash] with any of these keys:
    #   :single_date    - "YYYY", "YYYY-MM" or "YYYY-MM-DD"
    #   :date_range     - {:start => ..., :end => ...}
    #   :date_qualifier - 'questionable', 'approximate' or 'inferred'
    #   :date_note      - the stripped original value, set when the value could not be
    #                     parsed, failed validation, or lost possibly significant text
    def self.standardize(value)

      date_data = {} # create the hash to hold all the data
      source_date_string = value.strip # variable to hold original value

      # weed out obvious bad dates before processing
      if (value.match(/([Pp]re|[Pp]ost|[Bb]efore|[Aa]fter|[Uu]nknown|[Uu]ndated|n\.d\.)/)) ||
          (value.match(/\d\d\d\d-\z/)) || # 1975-
          (value.match(/\d\d-\d\d\/\d\d/)) || # 1975-09-09/10
          (value.match(/\d*\(\d*\)/)) || # 1975(1976)
          (value.scan(/\d\d\d\d/).length > 2) || # 1861/1869/1915
          (value.scan(/([Ee]arly|[Ll]ate|[Mm]id|[Ww]inter|[Ss]pring|[Ss]ummer|[Ff]all)/).length > 1) ||
          # or if data does not match any of these
          (!value.match(/(\d\dth [Cc]entury|\d\d\d-\?*|\d\d\d\?|\d\d\?\?|\d\d\d\d)/))
        date_data[:date_note] = source_date_string
      else
        # find date qualifier
        if value.include? '?'
          date_data[:date_qualifier] = 'questionable'
        elsif value.match(/\A[Cc]/)
          date_data[:date_qualifier] = 'approximate'
        elsif (value.match(/[\[\]]+/)) || (value.match(/[(][A-Za-z, \d]*[\d]+[A-Za-z, \d]*[)]+/)) # if [] or ()
          date_data[:date_qualifier] = 'inferred'
        end

        # remove unnecessary chars and words
        value = value.gsub(/[\[\]\(\)\.,']/,'')
        value = value.gsub(/(\b[Bb]etween\b|\bcirca\b|\bca\b|\Aca|\Ac)/,'').strip

        # differentiate between ranges and single dates
        if (value.scan(/\d\d\d\d/).length == 2) ||
            (value.include? '0s') ||      # 1970s
            (value.include? 'entury') ||  # 20th century
            (value.match(/(\A\d\d\d\?|\A\d\d\?\?|\A\d\d\d-\?*|\d\d\d\d-\d\z|\d\d\d\d\/[\d]{1,2}\z)/)) ||
            (value.match(/([Ee]arly|[Ll]ate|[Mm]id|[Ww]inter|[Ss]pring|[Ss]ummer|[Ff]all)/)) ||
            ((value.match(/\d\d\d\d-\d\d\z/)) && (value[-2..-1].to_i > 12)) # 1975-76 but NOT 1910-11

          # RANGES
          date_data[:date_range] = {}
          date_data_range_start = nil
          date_data_range_end = nil

          # deal with date strings with 2 4-digit year values separately
          if value.scan(/\d\d\d\d/).length == 2

            # convert weird span indicators ('or','and','||'), remove extraneous text
            value = value.gsub(/(or|and|\|\|)/,'-').gsub(/[A-Za-z\?\s]/,'')

            if value.match(/\A[12][\d]{3}-[01][\d]-[12][\d]{3}-[01][\d]\z/) # 1895-05-1898-01
              date_data_range_start = value[0..6]
              date_data_range_end = value[-7..-1]
            elsif value.match(/\A[12][\d]{3}\/[12][\d]{3}\z/) # 1987/1988
              date_data_range_start = value[0..3]
              date_data_range_end = value[-4..-1]
            else
              range_dates = value.split('-') # split the dates into an array
              range_dates.each_with_index do |range_date,index|
                value_to_insert = nil
                range_date_piece_day = nil
                # format the data properly
                if range_date.include? '/' # 11/05/1965
                  range_date_pieces = range_date.split('/')
                  range_date_piece_year = range_date_pieces.last
                  range_date_piece_month = range_date_pieces.first.length == 2 ? range_date_pieces.first : '0' + range_date_pieces.first
                  if range_date_pieces.length == 3
                    range_date_piece_day = range_date_pieces[1].length == 2 ? range_date_pieces[1] : '0' + range_date_pieces[1]
                  end
                  value_to_insert = range_date_piece_year + '-' + range_date_piece_month
                  value_to_insert << '-' + range_date_piece_day if range_date_piece_day
                elsif range_date.match(/\A[12][\d]{3}\z/)
                  value_to_insert = range_date
                end
                # add the data to the proper variable
                if value_to_insert
                  if index == 0
                    date_data_range_start = value_to_insert
                  else
                    date_data_range_end = value_to_insert
                  end
                end
              end
            end
          else
            # if there are 'natural language' range values, find, assign to var, then remove
            text_range = value.match(/([Ee]arly|[Ll]ate|[Mm]id|[Ww]inter|[Ss]pring|[Ss]ummer|[Ff]all)/).to_s
            if text_range.length > 0
              date_data[:date_qualifier] ||= 'approximate' # TODO - remove this??
              value = value.gsub(/#{text_range}/,'').strip
            end

            # deal with ranges for which 'natural language' range values are ignored
            if value.match(/\A1\d\?\?\z/) # 19??
              date_data_range_start = value[0..1] + '00'
              date_data_range_end = value[0..1] + '99'
            elsif value.match(/\A[12]\d\d-*\?*\z/) # 195? || 195-? || 195-
              date_data_range_start = value[0..2] + '0'
              date_data_range_end = value[0..2] + '9'
            elsif value.match(/\A[12]\d\d\d[-\/][\d]{1,2}\z/) # 1956-57 || 1956/57 || 1956-7
              if value.length == 7 && (value[5..6].to_i > value[2..3].to_i)
                date_data_range_start = value[0..3]
                date_data_range_end = value[0..1] + value[5..6]
              elsif value.length == 6 && (value[5].to_i > value[3].to_i)
                date_data_range_start = value[0..3]
                date_data_range_end = value[0..2] + value[5]
              end
              date_data[:date_note] = source_date_string if text_range.length > 0
            end
            # deal with ranges where text range values are evaluated
            value = value.gsub(/\?/,'').strip # remove question marks

            # centuries
            if value.match(/([12][\d]{1}th [Cc]entury|[12][\d]{1}00s)/) # 19th century || 1800s
              if value.match(/[12][\d]{1}00s/)
                century_prefix_date = value.match(/[12][\d]{1}/).to_s
              else
                century_prefix_date = (value.match(/[12][\d]{1}/).to_s.to_i-1).to_s
              end
              if text_range.match(/([Ee]arly|[Ll]ate|[Mm]id)/)
                if text_range.match(/[Ee]arly/)
                  century_suffix_dates = %w[00 39]
                elsif text_range.match(/[Mm]id/)
                  century_suffix_dates = %w[30 69]
                else
                  century_suffix_dates = %w[60 99]
                end
              end
              date_data_range_start = century_suffix_dates ? century_prefix_date + century_suffix_dates[0] : century_prefix_date + '00'
              date_data_range_end = century_suffix_dates ? century_prefix_date + century_suffix_dates[1] : century_prefix_date + '99'
            else
              # remove any remaining non-date text, but preserve decade-ness.
              # The second digit may be 0 so that "2010s" counts as a decade;
              # "x00s" values never get here because they are handled as centuries above.
              is_decade = !!value.match(/[12]\d[1-9]0s/)
              remaining_text = value.match(/\D+/).to_s
              # literal (non-regex) removal, so leftover punctuation cannot be read as a pattern
              value = value.gsub(remaining_text,'').strip if remaining_text.length > 0

              # decades
              if is_decade
                decade_prefix_date = value.match(/\A[12]\d[1-9]/).to_s
                if text_range.match(/([Ee]arly|[Ll]ate|[Mm]id)/)
                  if text_range.match(/[Ee]arly/)
                    decade_suffix_dates = %w[0 3]
                  elsif text_range.match(/[Mm]id/)
                    decade_suffix_dates = %w[4 6]
                  else
                    decade_suffix_dates = %w[7 9]
                  end
                end
                date_data_range_start = decade_suffix_dates ? decade_prefix_date + decade_suffix_dates[0] : decade_prefix_date + '0'
                date_data_range_end = decade_suffix_dates ? decade_prefix_date + decade_suffix_dates[1] : decade_prefix_date + '9'
              else
                # single year ranges
                single_year_prefix = value.match(/[12][0-9]{3}/).to_s
                # only build a month range when a full year is present; otherwise a range
                # found above (e.g. "early 195?") would be overwritten with "-01".."-04"
                if text_range.length > 0 && !single_year_prefix.empty?
                  if text_range.match(/[Ee]arly/)
                    single_year_suffixes = %w[01 04]
                  elsif text_range.match(/[Mm]id/)
                    single_year_suffixes = %w[05 08]
                  elsif text_range.match(/[Ll]ate/)
                    single_year_suffixes = %w[09 12]
                  elsif text_range.match(/[Ww]inter/)
                    single_year_suffixes = %w[01 03]
                  elsif text_range.match(/[Ss]pring/)
                    single_year_suffixes = %w[03 05]
                  elsif text_range.match(/[Ss]ummer/)
                    single_year_suffixes = %w[06 08]
                  else # [Ff]all, the only value text_range can still hold
                    single_year_suffixes = %w[09 11]
                  end
                  date_data_range_start = single_year_prefix + '-' + single_year_suffixes[0]
                  date_data_range_end = single_year_prefix + '-' + single_year_suffixes[1]
                end
              end
              # if possibly significant info removed, include as note
              date_data[:date_note] = source_date_string if remaining_text.length > 1
            end
          end

          # insert the values into the date_data hash
          if date_data_range_start && date_data_range_end
            date_data[:date_range][:start] = date_data_range_start
            date_data[:date_range][:end] = date_data_range_end
          else
            date_data[:date_note] ||= source_date_string
            date_data.delete :date_range
          end

        else
          # SINGLE DATES
          value = value.gsub(/\?/,'') # remove question marks
          # fix bad spacing (e.g. December 13,1985 || December 3,1985)
          value = value.insert(-5, ' ') if value.match(/[A-Za-z]* \d{6}/) || value.match(/[A-Za-z]* \d{5}/)

          # try to automatically parse single dates with YYYY && MM && DD values
          parsed_date = Timeliness.parse(value)
          if parsed_date.nil?
            # start further processing
            if value.match(/\A[12]\d\d\d-[01][0-9]\z/) # yyyy-mm
              date_data[:single_date] = value
            elsif value.match(/\A(0?[1-9]|1[0-2])[-\/][12]\d\d\d\z/) # mm-yyyy || m-yyyy || mm/yyyy (months 1-12)
              value = '0' + value if value.match(/\A[1-9][-\/][12]\d\d\d\z/) # m-yyyy || m/yyyy
              date_data[:single_date] = value[3..6] + '-' + value[0..1]
            elsif value.match(/\A[A-Za-z]{3,} [12]\d\d\d\z/) # April 1987 || Apr. 1987
              month_name, year = value.split(' ')
              month_name = month_name.capitalize
              month_index = month_name.length == 3 ? Date::ABBR_MONTHNAMES.index(month_name) : Date::MONTHNAMES.index(month_name)
              if month_index
                date_data[:single_date] = year + '-' + ('%02d' % month_index)
              else
                # unrecognised month word (e.g. "Sept"): keep the year, preserve the original as a note
                date_data[:single_date] = year
                date_data[:date_note] = source_date_string
              end
            elsif value.match(/\A[12]\d\d\d\z/) # 1999
              date_data[:single_date] = value
            else
              date_data[:date_note] = source_date_string
            end
          else
            date_data[:single_date] = parsed_date.strftime("%Y-%m-%d")
          end

        end

      end

      # some final validation, just in case
      date_validation_array = []
      date_validation_array << date_data[:single_date] if date_data[:single_date]
      date_validation_array << date_data[:date_range][:start] if date_data[:date_range]
      date_validation_array << date_data[:date_range][:end] if date_data[:date_range]
      date_validation_array.each do |date_to_val|
        bad_date =
          case date_to_val.length
          when 7 then !date_to_val[-2..-1].to_i.between?(1,12) # YYYY-MM: month must be 1-12
          when 10 then Timeliness.parse(date_to_val).nil?      # YYYY-MM-DD: must parse as a date
          else false
          end
        if bad_date
          date_data[:date_note] ||= source_date_string
          date_data.delete :single_date if date_data[:single_date]
          date_data.delete :date_range if date_data[:date_range]
        end
      end

      # if the date slipped by all the processing somehow!
      if date_data[:single_date].nil? && date_data[:date_range].nil? && date_data[:date_note].nil?
        date_data[:date_note] = source_date_string
      end

      date_data

    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module BplEnrich
  class LCSH

    # Takes an LCSH subject string and normalizes its punctuation.
    #
    # Strips a quotation mark that opens or closes the value, drops a final
    # period (unless it follows a capital letter or ends "etc."), rewrites
    # dash variants to "--" with no surrounding whitespace, and capitalizes
    # the first character.
    #
    # @param value [String, nil] raw subject heading
    # @return [String, nil] '' when value is blank; otherwise whatever
    #   BplEnrich.strip_value returns for the normalized string
    def self.standardize(value)
      return '' if value.blank?

      #Remove stuff that is quoted (quotation for first and last words)..
      value = value.gsub(/^['"]/, '').gsub(/['"]$/, '').strip

      #Remove ending periods ... except when an initial or etc.
      #(written so that one- to three-character values such as "a." do not raise)
      if value.end_with?('.') && value[-2] !~ /[A-Z]/ && !value.end_with?('etc.')
        value = value.slice(0..-2)
      end

      #Fix when '- -' occurs
      value = value.gsub(/-\s-/,'--')

      #Fix for "em" dashes
      value = value.gsub('—','--')

      #Fix for "en" dashes
      value = value.gsub('–','--')

      #Fix for ' - ' combinations
      value = value.gsub(' - ','--')

      #Remove white space after and before '--'
      value = value.gsub(/\s+--/,'--')
      value = value.gsub(/--\s+/,'--')

      #Ensure first word is capitalized (nothing to do if only quotes/periods were given)
      value[0] = value[0].capitalize[0] unless value.empty?

      #Strip any white space
      BplEnrich.strip_value(value)
    end

  end
end
|
data/lib/bpl_enrich.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module BplEnrich
  require "bpl_enrich/lcsh"
  require "bpl_enrich/dates"
  require "bpl_enrich/constants"
  require "bpl_enrich/authorities"
  require "timeliness"
  require "unidecoder"
  require "htmlentities"
  require "qa"

  # Cleans a scalar value for use as text.
  #
  # @param value [Object] string or number to clean
  # @return [String, nil] nil when value is blank; otherwise the cleaned
  #   string from utf8Encode. Floats and Integers are truncated to an integer
  #   string first (1975.0 becomes "1975").
  def self.strip_value(value)
    return nil if value.blank?

    # Integer replaces Fixnum, which was removed in Ruby 3.2; on older Rubies
    # Fixnum is a subclass of Integer, so this still matches.
    value = value.to_i.to_s if value.is_a?(Float) || value.is_a?(Integer)

    # Make sure it is all UTF-8 and not character encodings or HTML tags and remove any carriage returns
    utf8Encode(value)
  end

  #TODO: Better name for this. Should be part of an overall helper gem.
  #
  # Replaces tabs, line breaks and <br/> tags with single spaces, removes
  # HTML markup with the ActionView full sanitizer, decodes HTML entities and
  # strips surrounding whitespace.
  #
  # @param value [#to_s]
  # @return [String]
  def self.utf8Encode(value)
    flattened = value.to_s.gsub(/\r?\n?\t/, ' ').gsub(/\r?\n/, ' ').gsub(/<br[\s]*\/>/,' ')
    without_markup = ActionView::Base.full_sanitizer.sanitize(flattened)
    ::HTMLEntities.new.decode(without_markup).strip
  end

end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises BplEnrich::Authorities through its three public lookups.
class AuthoritiesTest < ActiveSupport::TestCase
  # A language resolves from either its code or its label.
  def test_parse_language
    result = BplEnrich::Authorities.parse_language('eng')
    assert_equal 'English', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/iso639-2/eng', result[:uri]

    result = BplEnrich::Authorities.parse_language('English')
    assert_equal 'English', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/iso639-2/eng', result[:uri]
  end

  # A role resolves from its label.
  def test_parse_role
    result = BplEnrich::Authorities.parse_role('Contributor')
    assert_equal 'Contributor', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/relators/ctb', result[:uri]

    #FIXME: Using URI doesn't seem to work in this vocab?
    #result = BplEnrich::Authorities.parse_role('ctb')
    #assert_equal 'Contributor', result[:label]
    #assert_equal 'http://id.loc.gov/vocabulary/relators/ctb', result[:uri]
  end

  # A trailing role word is split off the name only when it is a known relator label.
  def test_parse_name_for_role
    result = BplEnrich::Authorities.parse_name_for_role('Steven Anderson (Contributor)')
    assert_equal 'Steven Anderson', result[:name]
    assert_equal 'Contributor', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/relators/ctb', result[:uri]

    # assert_nil rather than assert_equal nil, which Minitest 5 deprecates
    result = BplEnrich::Authorities.parse_name_for_role('Steven Anderson (Painter)')
    assert_equal 'Steven Anderson (Painter)', result[:name]
    assert_nil result[:label]
    assert_nil result[:uri]

    #Special non-Ascii character check
    result = BplEnrich::Authorities.parse_name_for_role('Sully, François (Photographer)')
    assert_equal 'Sully, François', result[:name]
    assert_equal 'Photographer', result[:label]
    assert_equal 'http://id.loc.gov/vocabulary/relators/pht', result[:uri]
  end
end
|
data/test/dates_test.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises BplEnrich::Dates.standardize.
class DatesTest < ActiveSupport::TestCase
  # "Month YYYY" becomes a YYYY-MM single date with no range and no note.
  def test_date_standardizer
    result = BplEnrich::Dates.standardize('April 1983')
    assert_equal '1983-04', result[:single_date]
    # assert_nil rather than assert_equal nil, which Minitest 5 deprecates
    assert_nil result[:date_range]
    assert_nil result[:date_note]
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
== README
|
2
|
+
|
3
|
+
This README would normally document whatever steps are necessary to get the
|
4
|
+
application up and running.
|
5
|
+
|
6
|
+
Things you may want to cover:
|
7
|
+
|
8
|
+
* Ruby version
|
9
|
+
|
10
|
+
* System dependencies
|
11
|
+
|
12
|
+
* Configuration
|
13
|
+
|
14
|
+
* Database creation
|
15
|
+
|
16
|
+
* Database initialization
|
17
|
+
|
18
|
+
* How to run the test suite
|
19
|
+
|
20
|
+
* Services (job queues, cache servers, search engines, etc.)
|
21
|
+
|
22
|
+
* Deployment instructions
|
23
|
+
|
24
|
+
* ...
|
25
|
+
|
26
|
+
|
27
|
+
Please feel free to use a different markup language if you do not plan to run
|
28
|
+
<tt>rake doc:app</tt>.
|
data/test/dummy/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
// This is a manifest file that'll be compiled into application.js, which will include all the files
|
2
|
+
// listed below.
|
3
|
+
//
|
4
|
+
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
|
5
|
+
// or vendor/assets/javascripts of plugins, if any, can be referenced here using a relative path.
|
6
|
+
//
|
7
|
+
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
|
8
|
+
// compiled file.
|
9
|
+
//
|
10
|
+
// Read Sprockets README (https://github.com/sstephenson/sprockets#sprockets-directives) for details
|
11
|
+
// about supported directives.
|
12
|
+
//
|
13
|
+
//= require_tree .
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
3
|
+
* listed below.
|
4
|
+
*
|
5
|
+
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
6
|
+
* or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path.
|
7
|
+
*
|
8
|
+
* You're free to add application-wide styles to this file and they'll appear at the top of the
|
9
|
+
* compiled file, but it's generally better to create a new file per style scope.
|
10
|
+
*
|
11
|
+
*= require_self
|
12
|
+
*= require_tree .
|
13
|
+
*/
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Dummy</title>
|
5
|
+
<%= stylesheet_link_tag "application", media: "all", "data-turbolinks-track" => true %>
|
6
|
+
<%= javascript_include_tag "application", "data-turbolinks-track" => true %>
|
7
|
+
<%= csrf_meta_tags %>
|
8
|
+
</head>
|
9
|
+
<body>
|
10
|
+
|
11
|
+
<%= yield %>
|
12
|
+
|
13
|
+
</body>
|
14
|
+
</html>
|
data/test/dummy/bin/rake
ADDED