marc2linkeddata 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.env_example +62 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +234 -0
- data/bin/console +8 -0
- data/bin/loc_downloads.sh +62 -0
- data/bin/loc_import_4store.sh +24 -0
- data/bin/loc_import_allegrograph.sh +22 -0
- data/bin/loc_import_marklogic.sh +19 -0
- data/bin/readMarcAuthority +113 -0
- data/lib/marc2linkeddata/configuration.rb +146 -0
- data/lib/marc2linkeddata/isni.rb +23 -0
- data/lib/marc2linkeddata/lib_auth.rb +17 -0
- data/lib/marc2linkeddata/loc.rb +91 -0
- data/lib/marc2linkeddata/oclc_creative_work.rb +44 -0
- data/lib/marc2linkeddata/oclc_identity.rb +46 -0
- data/lib/marc2linkeddata/oclc_resource.rb +79 -0
- data/lib/marc2linkeddata/oclc_work.rb +19 -0
- data/lib/marc2linkeddata/parseMarcAuthority.rb +492 -0
- data/lib/marc2linkeddata/readMarcCatalog.rb +175 -0
- data/lib/marc2linkeddata/resource.rb +131 -0
- data/lib/marc2linkeddata/sparql.rb +55 -0
- data/lib/marc2linkeddata/viaf.rb +48 -0
- data/lib/marc2linkeddata.rb +64 -0
- data/marc2linkeddata.gemspec +40 -0
- data/spec/marc2linkeddata/configuration_spec.rb +84 -0
- data/spec/marc2linkeddata/loc_spec.rb +71 -0
- data/spec/marc2linkeddata/resource_spec.rb +53 -0
- data/spec/marc2linkeddata/viaf_spec.rb +53 -0
- data/spec/marc2linkeddata_spec.rb +39 -0
- data/spec/spec_helper.rb +92 -0
- metadata +243 -0
@@ -0,0 +1,492 @@
|
|
1
|
+
|
2
|
+
# Marc21 Authority fields are documented at
|
3
|
+
# http://www.loc.gov/marc/authority/ecadlist.html
|
4
|
+
# http://www.loc.gov/marc/authority/ecadhome.html
|
5
|
+
|
6
|
+
module Marc2LinkedData
|
7
|
+
|
8
|
+
class ParseMarcAuthority
|
9
|
+
|
10
|
+
# TODO: provide iterator pattern on an entire file of records.
|
11
|
+
# @leader = ParseMarcAuthority::parse_leader(marc_file)
|
12
|
+
# raw = marc_file.read(@leader[:length])
|
13
|
+
# @record = MARC::Reader.decode(raw)
|
14
|
+
|
15
|
+
@@config = nil
|
16
|
+
|
17
|
+
attr_reader :loc
|
18
|
+
attr_reader :isni
|
19
|
+
attr_reader :viaf
|
20
|
+
|
21
|
+
# Wrap one decoded MARC authority record for conversion to linked data.
#
# @param record the MARC record whose fields are read by the parsing methods
def initialize(record)
  # The gem configuration is fetched on first use and then shared through
  # the class variable by every later instance.
  @@config = Marc2LinkedData.configuration unless @@config
  @record = record
  # The graph starts empty and the authority handles start unset.
  @graph = RDF::Graph.new
  @loc = @isni = @viaf = nil
end
|
29
|
+
|
30
|
+
# Collect every field of the wrapped record whose tag equals +field_num+.
#
# @param field_num [String] MARC tag to look for, e.g. '001'
# @return [Array] the matching fields, in record order (never empty)
# @raise [RuntimeError] when the record has no field with that tag
def get_fields(field_num)
  # The block is a plain predicate; the former `f if ...` form only worked
  # because a field object is always truthy.
  fields = @record.fields.select { |f| f.tag == field_num }
  raise "Invalid data in field #{field_num}" if fields.empty?
  fields
end
|
35
|
+
|
36
|
+
# Try to use the SUL catkey and/or the OCLC control numbers, maybe SUL
|
37
|
+
# catkey in the record IRI
|
38
|
+
# Return the record identifier, i.e. the value of the first 001 field.
#
# Control number documentation:
# http://www.loc.gov/marc/authority/ad001.html
#
# @return [String] value of the first 001 control field
# @raise [RuntimeError] propagated from get_fields when there is no 001 field
def get_id
  # A combined "<003>-<001>" identifier was considered but is not used.
  control_fields = get_fields('001')
  control_fields.first.value
end
|
46
|
+
|
47
|
+
# Find the first subfield value of +field+ that contains +iri_pattern+.
#
# @param field [#subfields, nil] MARC data field to search
# @param iri_pattern [String] substring identifying the IRI, e.g. 'viaf.org'
# @return [String, nil] the first matching subfield value; nil when nothing
#   matches or when the field cannot be inspected
def get_iri(field, iri_pattern)
  return nil if field.nil?
  # Search for the first match. Mapping non-matching subfields to nil and
  # then taking `first` only ever succeeded when the very first subfield
  # happened to hold the IRI.
  match = field.subfields.find { |sf| sf.value.to_s.include?(iri_pattern) }
  match && match.value
rescue StandardError
  # Best effort: anything that does not look like a data field yields nil.
  nil
end
|
55
|
+
|
56
|
+
# Resolve the ISNI IRI for this authority.
#
# The IRI is first looked for in the MARC field configured as
# @@config.field_auth_isni; when it is not there and @@config.get_isni is
# set, @viaf is asked for it instead.
#
# @return [String, nil] an IRI with the http://www.isni.org/isni/ prefix
#   where it could be normalised, or nil when no ISNI could be resolved
def get_iri4isni
  # e.g. http://www.isni.org/0000000109311081
  isni_iri = begin
    get_iri(get_fields(@@config.field_auth_isni).first, 'isni.org')
  rescue StandardError
    # get_fields raises when the record has no such field at all; that
    # must not prevent the VIAF fallback below.
    nil
  end
  # If ISNI is not already in the MARC record, try to get it from VIAF.
  if isni_iri.nil? && @@config.get_isni
    isni_iri = begin
      @viaf.get_isni
    rescue StandardError
      nil # @viaf may be unset, or the lookup may fail
    end
    @@config.logger.debug 'Failed to resolve ISNI URI' if isni_iri.nil?
  end
  # Ensure the ISNI IRI has this prefix: http://www.isni.org/isni/
  # (gsub is non-destructive, so its result has to be assigned).
  unless isni_iri.nil? || isni_iri =~ /www\.isni\.org\/isni\//
    isni_iri = isni_iri.gsub('www.isni.org', 'www.isni.org/isni')
  end
  isni_iri
rescue StandardError
  nil
end
|
77
|
+
|
78
|
+
# Build the local library authority IRI for this record.
#
# @return [String] the configured 'lib_auth' prefix followed by the record id
def get_iri4lib
  prefix = @@config.prefixes['lib_auth']
  "#{prefix}#{get_id}"
end
|
81
|
+
|
82
|
+
# Resolve the Library of Congress IRI for this authority.
#
# The IRI is taken from the MARC field configured as
# @@config.field_auth_loc when present. Otherwise it is derived from the
# record id (ids starting with 'n' are names, ids starting with 'sh' are
# subjects) and checked with an HTTP HEAD request.
#
# @return [String, nil] the LOC IRI, or nil when it cannot be resolved or
#   any step raises
def get_iri4loc
  # e.g. http://id.loc.gov/authorities/names/n42000906
  loc_iri = begin
    get_iri(get_fields(@@config.field_auth_loc).first, 'id.loc.gov')
  rescue
    nil
  end
  begin
    unless loc_iri.nil?
      @@config.logger.debug "MARC contains LOC IRI: #{loc_iri}"
      return loc_iri
    end
    # If the LOC is not in the marc record, try to determine the LOC IRI from the ID.
    loc_id = get_id
    loc_iri = "#{@@config.prefixes['loc_names']}#{loc_id.downcase}" if loc_id =~ /^n/i
    loc_iri = "#{@@config.prefixes['loc_subjects']}#{loc_id.downcase}" if loc_id =~ /^sh/i
    unless loc_iri.nil?
      # Verify the URL (used HEAD so it's as fast as possible)
      @@config.logger.debug "Trying to validate LOC IRI: #{loc_iri}"
      res = Marc2LinkedData.http_head_request(loc_iri + '.rdf')
      case res.code
      when '301'
        # use the redirection
        loc_iri = res['location']
      when '200', '302', '303'
        # 200: it's good to go.
        # 302 Moved Temporarily / 303 See Other: use the current URL,
        # most get requests will follow a 302 or 303.
      else
        loc_iri = nil
      end
    end
    if loc_iri.nil?
      # If it gets here, it's a problem.
      binding.pry if @@config.debug
      @@config.logger.error 'FAILURE to resolve LOC IRI'
    else
      @@config.logger.debug "DISCOVERED LOC IRI: #{loc_iri}"
    end
    return loc_iri
  rescue
    nil
  end
end
|
133
|
+
|
134
|
+
# Build a WorldCat IRI from the OCLC control number in the MARC record.
#
# Only the first field with the tag configured as @@config.field_auth_oclc
# is inspected; the trailing digits of its first 'a' subfield form the id.
#
# @return [String, nil] "http://www.worldcat.org/oclc/<digits>", or nil when
#   the field, the subfield or the digits are missing
def get_iri4oclc
  field = get_fields(@@config.field_auth_oclc).first
  # Look for the first subfield coded 'a' wherever it sits in the field.
  # Mapping other subfields to nil and taking `first` missed it unless it
  # was the very first subfield.
  subfield_a = field.subfields.find { |sf| sf.code == 'a' }
  oclc_cn = subfield_a && subfield_a.value
  oclc_id = /\d+$/.match(oclc_cn.to_s).to_s
  oclc_id.empty? ? nil : "http://www.worldcat.org/oclc/#{oclc_id}"
rescue StandardError
  nil
end
|
144
|
+
|
145
|
+
# Resolve the VIAF IRI for this authority.
#
# The IRI is first looked for in the MARC field configured as
# @@config.field_auth_viaf; when it is not there and @@config.get_viaf is
# set, @loc is asked for it instead.
#
# @return [String, nil] the VIAF IRI (whatever @loc.get_viaf returns in the
#   fallback case), or nil when it cannot be resolved
def get_iri4viaf
  # e.g. http://viaf.org/viaf/181829329
  # VIAF RSS feed for changes, e.g. http://viaf.org/viaf/181829329.rss
  viaf_iri = begin
    get_iri(get_fields(@@config.field_auth_viaf).first, 'viaf.org')
  rescue StandardError
    # get_fields raises when the record has no such field at all; that
    # must not prevent the LOC fallback below.
    nil
  end
  # If VIAF is not already in the MARC record, try to get from LOC.
  if viaf_iri.nil? && @@config.get_viaf
    viaf_iri = begin
      @loc.get_viaf
    rescue StandardError
      nil # @loc may be unset, or the lookup may fail
    end
    @@config.logger.debug 'Failed to resolve VIAF URI' if viaf_iri.nil?
  end
  viaf_iri
rescue StandardError
  nil
end
|
161
|
+
|
162
|
+
# Peek at the MARC leader of the next record without consuming it.
#
# Example leader: "00774cz  a2200253n  4500"
#   00-04: '00774' - record length
#   05:    'c'     - record status (a: increase in encoding level,
#                    c: corrected or revised, d: deleted, n: new,
#                    o: obsolete, s: deleted; heading split into two or
#                    more headings, x: deleted; heading replaced by another
#                    heading)
#   06:    'z'     - always 'z' for authority records
#   09:    'a'     - UCS/Unicode
#   12-16: '00253' - base address of data, length of leader and directory
#   17:    'n'     - complete authority record
#
# @param file_handle [#read, #seek] stream positioned at the start of a record
# @param leader_bytes [Integer] number of bytes to read for the leader
# @return [Hash, nil] :length, :status, :type, :encoding, :data_address and
#   :complete; nil when nothing could be read (end of stream)
# @raise [ArgumentError] when fewer than 18 bytes are available, i.e. the
#   leader positions used above cannot all be read
def self.parse_leader(file_handle, leader_bytes=24)
  leader = file_handle.read(leader_bytes)
  return nil if leader.nil? || leader.empty?
  # Step back by what was actually read so a short read cannot seek past
  # the start of the stream.
  file_handle.seek(-leader.bytesize, IO::SEEK_CUR)
  if leader.length < 18
    raise ArgumentError, "MARC leader too short: expected at least 18 bytes, got #{leader.length}"
  end
  {
    :length => leader[0..4].to_i,
    :status => leader[5],
    :type => leader[6], # always 'z' for authority records
    :encoding => leader[9], # TODO: translate letter code into ruby encoding string
    :data_address => leader[12..16].to_i,
    :complete => leader[17] == 'n'
  }
end
|
192
|
+
|
193
|
+
# Decode the fixed-length 008 control field of the authority record.
# http://www.loc.gov/marc/authority/concise/ad008.html
#
# @return [Hash] decoded positions: :date (Date parsed from positions 00-05
#   as yymmdd), :languages (Array of 'English'/'French'), :rules (String,
#   '' for an unlisted code), boolean flags for the use_1XX_* and
#   :record_available keys, and the raw one-character code for every other key
# @raise [RuntimeError] propagated from get_fields when there is no 008 field
def parse_008
  field008 = get_fields('008').first.value
  # Position 08: 'b' means both languages, 'e' English only, 'f' French only.
  # `<<` is used because Array#append only exists since Ruby 2.5.
  language_code = field008[8]
  languages = []
  languages << 'English' if %w[b e].include?(language_code)
  languages << 'French' if %w[b f].include?(language_code)
  # Position 10: descriptive cataloging rules; unlisted codes map to ''.
  rules = {
    'a' => 'EARLIER',
    'b' => 'AACR1',
    'c' => 'AACR2',
    'd' => 'AACR2 compatible',
    'z' => 'OTHER',
    'n' => 'N/A'
  }.fetch(field008[10], '')
  # 32 - Undifferentiated personal name (not decoded yet)
  #   Whether the personal name in a name or name/title heading contained in
  #   field 100 in an established heading record or a reference record is
  #   used by one person or by two or more persons.
  #   a - Differentiated personal name: the name in field 100 is unique.
  #   b - Undifferentiated personal name: used by two or more persons.
  #   n - Not applicable: 1XX heading is not a personal name or the personal
  #       name is a family name.
  #   | - No attempt to code
  {
    :date => Date.strptime(field008[0..5], "%y%m%d"),
    :geographic_subdivision => field008[6], # '#', d, i, n, or '|'
    :romanization_scheme => field008[7], # a..g, n, or '|'
    :languages => languages,
    :kind => field008[9], # a..g, or '|'
    :rules => rules,
    :heading_system => field008[11],
    :series_type => field008[12],
    :series_numbered => field008[13],
    :use_1XX_for_7XX => field008[14] == 'a',
    :use_1XX_for_6XX => field008[15] == 'a',
    # 4XX and 8XX both read position 16.
    :use_1XX_for_4XX => field008[16] == 'a',
    :use_1XX_for_8XX => field008[16] == 'a',
    :type_subject_subdivision => field008[17],
    # 18-27 - Undefined character positions
    :type_government_agency => field008[28],
    :reference_evaluation => field008[29],
    # 30 - Undefined character position
    :record_available => field008[31] == 'a',
    # TODO: 32
    # TODO: 33
    # 34-37 - Undefined character positions
    # TODO: 38
    # TODO: 39
  }
end
|
243
|
+
|
244
|
+
# Extract the personal name (field 100, subfield 'a').
# http://www.loc.gov/marc/authority/concise/ad100.html
#
# @return [String] the name tagged as UTF-8; '' when the 100 field has no
#   'a' subfield; 'ERROR_PERSONAL_NAME' when the field is missing or
#   anything else goes wrong
def parse_100
  # 100 is a personal name
  field = get_fields('100').first
  # Explicit lookup instead of `select.first.value rescue ''`, so that only
  # a genuinely absent subfield produces the empty name.
  subfield_a = field.subfields.find { |sf| sf.code == 'a' }
  name = subfield_a.nil? ? '' : subfield_a.value
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PERSONAL_NAME'
end
|
256
|
+
|
257
|
+
# Extract the corporate name (field 110, subfields 'a', 'b' and 'c').
# http://www.loc.gov/marc/authority/concise/ad110.html
#
# @return [String] all 'a' values, then all 'b' values, then all 'c' values,
#   joined with ' : ' and tagged as UTF-8; '' when the record has no 110
#   field; 'ERROR_CORPORATE_NAME' when the record cannot be read
def parse_110
  # 110 is a corporate name
  field = @record.fields.find { |f| f.tag == '110' }
  return '' if field.nil?
  subfields = field.subfields
  # Group by code (a, b, c) rather than by position in the field.
  parts = %w[a b c].flat_map do |code|
    subfields.select { |sf| sf.code == code }.map(&:value).compact
  end
  parts.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_CORPORATE_NAME'
end
|
271
|
+
|
272
|
+
# Extract the meeting name (field 111, subfield 'a').
# http://www.loc.gov/marc/authority/concise/ad111.html
#
# @return [String] the 'a' subfield value(s) joined with ' : ' and tagged as
#   UTF-8; '' when the record has no 111 field; 'ERROR_MEETING_NAME' when
#   the record cannot be read
def parse_111
  # 111 is a meeting name
  field = @record.fields.find { |f| f.tag == '111' }
  return '' if field.nil?
  names = field.subfields.select { |sf| sf.code == 'a' }.map(&:value).compact
  # TODO: incorporate additional subfields (b, c)?
  # The values are joined into a String before force_encoding; calling it on
  # the Array itself raised, so the error marker was always returned.
  names.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_MEETING_NAME'
end
|
288
|
+
|
289
|
+
# Extract the geographic name (field 151, subfield 'a').
# http://www.loc.gov/marc/authority/concise/ad151.html
#
# @return [String] the first 'a' subfield value tagged as UTF-8; '' when the
#   record has no 151 field or the field has no 'a' subfield;
#   'ERROR_PLACE_NAME' when the record cannot be read
def parse_151
  # 151 is a geographic name
  field = @record.fields.find { |f| f.tag == '151' }
  return '' if field.nil?
  # Look for the first subfield coded 'a' wherever it sits in the field.
  # Mapping other subfields to nil and taking `first` missed it unless it
  # was the very first subfield.
  subfield_a = field.subfields.find { |sf| sf.code == 'a' }
  return '' if subfield_a.nil?
  subfield_a.value.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PLACE_NAME'
end
|
300
|
+
|
301
|
+
# Create type and name triples in @graph for a LOC *name* authority.
#
# At present, this relies on LOC RDF (the predicates asked of @loc) to
# differentiate types of authorities. It should be possible to do this
# from the MARC directly, if @@config.get_loc is false.
#
# The MARC data differentiates them according to the tag number.
# The term 'name' refers to:
#    X00 - Personal Name
#    X10 - Corporate Name
#    X11 - Meeting Name
#    X30 - Uniform Title
#    X51 - Jurisdiction / Geographic Name
#
# The label comes from @loc.label, falling back to the matching MARC 1XX
# field. FOAF and schema.org statements are emitted according to
# @@config.use_foaf and @@config.use_schema, except for the event and place
# types, which are always emitted.
#
# @return [String, Object, nil] '' when a name/title authority is skipped;
#   otherwise the value of the final conditional (not meant to be used)
def parse_loc_auth_name
  @@config.logger.warn "LOC URL: #{@loc.iri} DEPRECATED" if @loc.deprecated?
  name = ''
  if @loc.conference?
    # e.g. http://id.loc.gov/authorities/names/n79044866
    name = @loc.label || parse_111
    # schema:Event is the class; lower-case schema:event is a property and
    # therefore not a valid object for rdf:type.
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Event)
  elsif @loc.corporation?
    name = @loc.label || parse_110
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Organization) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Organization) if @@config.use_schema
  elsif @loc.name_title?
    # e.g. http://id.loc.gov/authorities/names/n79044934
    # Skipping these, because the person entity should be in
    # an additional record and we don't want the title content.
    binding.pry if @@config.debug
    return ''
  elsif @loc.person?
    name = @loc.label || parse_100
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Person) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Person) if @@config.use_schema
    # VIAF extracts first and last name, try to use them.  Note
    # that VIAF uses schema:name, schema:givenName, and schema:familyName.
    if @@config.get_viaf && ! @viaf.nil?
      @viaf.family_names.each do |n|
        # TODO: try to get a language type, if VIAF provide it.
        # name = RDF::Literal.new(n, :language => :en)
        ln = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.familyName, ln) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.familyName, ln) if @@config.use_schema
      end
      @viaf.given_names.each do |n|
        # TODO: try to get a language type, if VIAF provide it.
        # name = RDF::Literal.new(n, :language => :en)
        fn = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.firstName, fn) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.givenName, fn) if @@config.use_schema
      end
    end
  elsif @loc.place?
    # e.g. http://id.loc.gov/authorities/names/n79045127
    name = @loc.label || parse_151
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Place)
  else
    # TODO: find out what type this is.
    binding.pry if @@config.debug
    name = @loc.label || ''
    # Note: schema.org has no immediate parent for Person or Organization
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Agent) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Thing) if @@config.use_schema
  end
  # An empty name means "nothing usable found"; no name triple is written.
  if name != ''
    name = RDF::Literal.new(name)
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.name, name) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.name, name) if @@config.use_schema
  end
end
|
375
|
+
|
376
|
+
# Placeholder for LOC *subject* authorities: no triples are created yet.
# The only effect is opening a pry session when @@config.debug is set.
#
# The term 'subject' refers to:
#    X30 - Uniform Titles
#    X48 - Chronological Terms
#    X50 - Topical Terms
#    X51 - Geographic Names
#    X55 - Genre/Form Terms
#
# The term 'subject subdivision' refers to:
#    X80 - general subdivision terms
#    X81 - geographic subdivision names
#    X82 - chronological subdivision terms
#    X85 - form subdivision terms
def parse_loc_auth_subject
  # TODO: what to do with subjects?
  if @@config.debug
    binding.pry
  end
end
|
392
|
+
|
393
|
+
# Link the LOC authority to its OCLC identity and, optionally, to the
# creative works and generic works listed for that identity.
#
# Nothing is added when no OCLC IRI can be determined. The per-work
# creator/contributor/editor and exampleOfWork statements are only produced
# when @@config.oclc_auth2works is set.
def get_oclc_links
  oclc_iri = begin
    # Try to get OCLC using LOC ID.
    @loc.get_oclc_identity
  rescue
    # Try to get OCLC using 035a field data, but
    # this is not as reliable/accurate as LOC.
    get_iri4oclc
  end
  return if oclc_iri.nil?

  # Try to get additional data from OCLC, using the RDFa
  # available in the OCLC identities pages.
  oclc_auth = OclcIdentity.new oclc_iri
  @graph.insert RDF::Statement(@loc.rdf_uri, RDF::OWL.sameAs, oclc_auth.rdf_uri)
  oclc_auth.creative_works.each do |creative_work_uri|
    # Notes on OCLC data inconsistency:
    # RDFa for http://www.worldcat.org/identities/lccn-n79044798 contains:
    # <http://worldcat.org/oclc/747413718> a <http://schema.org/CreativeWork> .
    # However, the RDF for <http://worldcat.org/oclc/747413718> contains:
    # <http://www.worldcat.org/oclc/747413718> schema:exampleOfWork <http://worldcat.org/entity/work/id/994448191> .
    # Note how the subject here is 'WWW.worldcat.org' instead of 'worldcat.org'.
    creative_work = OclcCreativeWork.new creative_work_uri
    @graph.insert RDF::Statement(oclc_auth.rdf_uri, RDF::RDFS.seeAlso, creative_work.rdf_uri)
    next unless @@config.oclc_auth2works

    # Try to use VIAF to relate auth to work as creator, contributor, editor, etc.
    # Note that this requires additional RDF retrieval for each work (slower processing).
    unless @viaf.nil?
      # The first role that matches wins; at most one statement is added.
      role =
        if creative_work.creator? @viaf.iri
          RDF::SCHEMA.creator
        elsif creative_work.contributor? @viaf.iri
          RDF::SCHEMA.contributor
        elsif creative_work.editor? @viaf.iri
          RDF::SCHEMA.editor
        end
      unless role.nil?
        @graph.insert RDF::Statement(creative_work.rdf_uri, role, oclc_auth.rdf_uri)
      end
    end
    # TODO: Is auth the subject of the work (as in biography) or both (as in autobiography).
    # Try to find the generic work entity for this example work.
    creative_work.get_works.each do |oclc_work_uri|
      oclc_work = OclcWork.new oclc_work_uri
      @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.exampleOfWork, oclc_work.rdf_uri)
    end
  end
end
|
443
|
+
|
444
|
+
# TODO: use an 'affiliation' entry, maybe 373? (optional field)
|
445
|
+
|
446
|
+
# Serialize the graph returned by #graph by calling to_ttl on it.
#
# @return whatever the graph's own to_ttl returns
def to_ttl
  rdf_graph = graph
  rdf_graph.to_ttl
end
|
449
|
+
|
450
|
+
# Build (once) and return the RDF graph for this authority record.
#
# A non-empty @graph is returned as is. Otherwise the local library, LOC,
# VIAF and ISNI resources are resolved, owl:sameAs links are added for
# those that resolved, and - when @@config.get_loc is set - the LOC type is
# used to add name/subject triples and optional OCLC links.
#
# @return [RDF::Graph] the graph; it holds only the links that could be
#   resolved when the LOC authority is unavailable
def graph
  # TODO: figure out how to specify all the graph prefixes.
  return @graph unless @graph.empty?
  @lib = LibAuth.new get_iri4lib
  # Try to find LOC, VIAF, and ISNI IRIs in the MARC record.
  # Each lookup is best effort: a failure leaves the handle nil.
  @loc = Loc.new get_iri4loc rescue nil
  # Try to identify problems in getting an LOC IRI.
  binding.pry if (@@config.debug && @loc.nil?)
  # might require LOC to get ISNI.
  @viaf = Viaf.new get_iri4viaf rescue nil
  # might require VIAF to get ISNI.
  @isni = Isni.new get_iri4isni rescue nil

  # TODO: ORCID?  VIVO?  VITRO?  Stanford CAP?

  # Get LOC control number and add catalog permalink? e.g.
  # http://lccn.loc.gov/n79046291

  # @loc is guarded like @viaf and @isni: it is nil whenever the LOC lookup
  # above failed, and dereferencing it would raise NoMethodError.
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @loc.rdf_uri) unless @loc.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @viaf.rdf_uri) unless @viaf.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @isni.rdf_uri) unless @isni.nil?
  if @loc.nil?
    # Everything below needs the LOC authority.
    @@config.logger.error 'Skipping LOC-based extraction: LOC IRI could not be resolved'
    return @graph
  end
  return @graph unless @@config.get_loc

  # TODO: find codes in the marc record to differentiate the authority into
  # TODO: person, organization, event, etc. without getting LOC RDF.

  if @loc.iri.to_s =~ /name/
    parse_loc_auth_name
  elsif @loc.iri.to_s =~ /subjects/
    parse_loc_auth_subject
  else
    binding.pry if @@config.debug
  end
  # Optional elaboration of authority data with OCLC identity and works.
  get_oclc_links if @@config.get_oclc

  @@config.logger.info "Extracted #{@loc.id}"
  @graph
end
|
489
|
+
end
|
490
|
+
|
491
|
+
end
|
492
|
+
|
@@ -0,0 +1,175 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'marc'
|
4
|
+
require 'linkeddata'
|
5
|
+
require 'pry'
|
6
|
+
|
7
|
+
#EXAMPLE_RECORD_FILE='../marc/catalog/stf.00.mrc'
|
8
|
+
EXAMPLE_RECORD_FILE='../marc/catalog/stf.51.mrc'
|
9
|
+
|
10
|
+
## reading records from a batch file
|
11
|
+
#reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "MARC-8")
|
12
|
+
#reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "UTF-8", :validate_encoding => true)
|
13
|
+
|
14
|
+
#reader = MARC::ForgivingReader.new(EXAMPLE_RECORD_FILE)
|
15
|
+
|
16
|
+
# Read exactly one raw record from the example file and decode it.
# The first five bytes of a MARC record hold its total length (e.g. 774),
# so they are read first and the stream is rewound before the full read.
# The block form closes the file; 'rb' keeps the bytes untranslated.
raw = File.open(EXAMPLE_RECORD_FILE, 'rb') do |handle|
  rec_length = handle.read(5).to_i
  handle.rewind
  handle.read(rec_length)
end
record = MARC::Reader.decode(raw)
|
23
|
+
|
24
|
+
# From http://www.loc.gov/marc/authority/adleader.html
|
25
|
+
# System-Generated Elements - The following Leader elements are usually system generated:
|
26
|
+
#
|
27
|
+
# 00-04 Logical record length
|
28
|
+
#
|
29
|
+
# 05 - Record status:
|
30
|
+
# a - Increase in encoding level
|
31
|
+
# c - Corrected or revised
|
32
|
+
# d - Deleted
|
33
|
+
# n - New
|
34
|
+
# o - Obsolete
|
35
|
+
# s - Deleted; heading split into two or more headings
|
36
|
+
# x - Deleted; heading replaced by another heading
|
37
|
+
#
|
38
|
+
# 06 - Type of record
|
39
|
+
# z - Authority data
|
40
|
+
#
|
41
|
+
# 07-08 Undefined character positions
|
42
|
+
#
|
43
|
+
# 09 - Character coding scheme
|
44
|
+
# # - MARC-8
|
45
|
+
# a - UCS/Unicode
|
46
|
+
#
|
47
|
+
# 10 Indicator count
|
48
|
+
# 2 - Number of character positions used for indicators
|
49
|
+
#
|
50
|
+
# 11 Subfield code count
|
51
|
+
# 2 - Number of character positions used for a subfield code
|
52
|
+
#
|
53
|
+
# 12-16 Base address of data
|
54
|
+
# [number] - Length of Leader and Directory
|
55
|
+
#
|
56
|
+
# 17 - Encoding level
|
57
|
+
# n - Complete authority record
|
58
|
+
# o - Incomplete authority record
|
59
|
+
#
|
60
|
+
# 20-23 Entry map
|
61
|
+
#
|
62
|
+
# 18-19 - Undefined character positions
|
63
|
+
#
|
64
|
+
# 20 - Length of the length-of-field portion
|
65
|
+
# 4 - Number of characters in the length-of-field portion of a Directory entry
|
66
|
+
#
|
67
|
+
# 21 - Length of the starting-character-position portion
|
68
|
+
# 5 - Number of characters in the starting-character-position portion of a Directory entry
|
69
|
+
#
|
70
|
+
# 22 - Length of the implementation-defined portion
|
71
|
+
# 0 - Number of characters in the implementation-defined portion of a Directory entry
|
72
|
+
#
|
73
|
+
# It is common for default values in other Leader elements to be generated automatically as well.
|
74
|
+
# Capitalization - Alphabetic codes are input as lower case letters.
|
75
|
+
#
|
76
|
+
# example:
|
77
|
+
#record.leader
|
78
|
+
#=> "00774cz a2200253n 4500"
|
79
|
+
# 00-04: '00774' - record length
|
80
|
+
# 05: 'c' - corrected or revised
|
81
|
+
# 06: 'z' - always 'z' for authority records
|
82
|
+
# 09: 'a' - UCS/Unicode
|
83
|
+
# 12-16: '00253' - base address of data, Length of Leader and Directory
|
84
|
+
# 17: 'n' - Complete authority record
|
85
|
+
# Human-readable meaning of the record status code at leader position 05.
# Frozen so the shared lookup table cannot be modified by accident.
LEADER_STATUS_CODES = {
  'a' => 'Increase in encoding level',
  'c' => 'Corrected or revised',
  'd' => 'Deleted',
  'n' => 'New',
  'o' => 'Obsolete',
  's' => 'Deleted; heading split into two or more headings',
  'x' => 'Deleted; heading replaced by another heading'
}.freeze
|
94
|
+
# Decode the interesting positions of a record's MARC leader.
#
# @param record [#leader] a record whose leader is a String such as
#   "00774cz  a2200253n  4500"
# @return [Hash] :length (Integer, positions 00-04), :status (05),
#   :type (06), :encoding (09), :data_address (Integer, positions 12-16)
#   and :complete (true when position 17 is 'n')
def leader_parse(record)
  leader = record.leader
  {
    :length => leader[0..4].to_i,
    :status => leader[5],  # LEADER_STATUS_CODES[ leader[5] ] gives the description
    :type => leader[6],    # always 'z' for authority records
    :encoding => leader[9],  # TODO: translate letter code into ruby encoding string
    :data_address => leader[12..16].to_i,
    # Plain comparison: also yields false (instead of raising) when the
    # leader is too short to have position 17.
    :complete => leader[17] == 'n'
  }
end
|
104
|
+
|
105
|
+
|
106
|
+
# Stanford Resource keys and Ckeys can collide. They are only unique within their own set.
|
107
|
+
#
|
108
|
+
# When I do a catalogdump for ckey 6809804 I see:
|
109
|
+
#
|
110
|
+
# .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
|
111
|
+
#
|
112
|
+
# When we do a catalogdump for searchworks we filter the results to only export
|
113
|
+
# records with holdings, and not those things which are on order or "shadowed"
|
114
|
+
# i.e. hidden from public view, although we still have the bibliographic data in
|
115
|
+
# the database. When I extracted the records for conversion I selected all of
|
116
|
+
# them.
|
117
|
+
#
|
118
|
+
# - Josh
|
119
|
+
|
120
|
+
|
121
|
+
# Create SUL LOD...
|
122
|
+
# Base IRI under which the Stanford library resources are minted.
SUL_URI = RDF::URI.new('http://linked-data.stanford.edu/library/')

# The catalog key is the stripped value of the first 001 field.
field001 = record.fields.find { |f| f.tag == '001' }
cat_key = field001.value.strip
CAT_URI = SUL_URI.join("catalog/#{cat_key}")

# TODO: Evaluate whether cat_key is in SearchWorks, e.g.
# "http://searchworks.stanford.edu/catalog/#{cat_key}"
# http://searchworks.stanford.edu/catalog/7106054

# TODO: extract 035a for OCLC master control number.
# TODO: map the OCLC to the OCLC work number.
field035 = record.fields.select { |f| f.tag == '035' }


# Debugging stop: open a console, then quit without running what follows.
binding.pry
exit!
|
140
|
+
|
141
|
+
|
142
|
+
#There is nothing in the MARC record itself to indicate that a holding is
|
143
|
+
#'shadowed' (not available for public view), but one idea to handle them is
|
144
|
+
#to supply a list of shadowed ckeys and that list could easily be transformed
|
145
|
+
#
|
146
|
+
#into a list of triples like this:
|
147
|
+
# <http://linked-data.stanford.edu/library/catalog/{cKey}> <rdf:Property> <http://linked-data.stanford.edu/library/catalog/isShadowed>
|
148
|
+
#...or what ever predicate and object you want to use. Then you can load those into the triple store.
|
149
|
+
|
150
|
+
# .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
|
151
|
+
# When we do a catalogdump for searchworks we filter the results to only export
|
152
|
+
# records with holdings, and not those things which are on order or "shadowed"
|
153
|
+
# i.e. hidden from public view, although we still have the bibliographic data in
|
154
|
+
# the database. When I extracted the records for conversion I selected all of
|
155
|
+
# them.
|
156
|
+
# Holdings note (field 948); nil when the record carries no 948 field.
field948 = record.fields.select { |f| f.tag == '948' }
holdings = field948.empty? ? nil : field948.first.value


# TODO: construct RDF model, see http://blog.datagraph.org/2010/03/rdf-for-ruby
# RDF::Literal.new("Hello!", :language => :en)
#
# FIXME(review): `oclc4loc` is not defined anywhere in the visible part of
# this script, so building this hash raises NameError until it is supplied.
lod = {
  :id => CAT_URI, # the catalog IRI constant; a lower-case `cat_uri` is never defined
  :oclc => oclc4loc.collect {|uri| RDF::URI.new(uri) },
}

# Debugging stop: open a console, then quit.
binding.pry
exit!
|
170
|
+
|
171
|
+
#for record in reader
|
172
|
+
# # print out field 245 subfield a
|
173
|
+
# puts record['245']['a']
|
174
|
+
#end
|
175
|
+
|