marc2linkeddata 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.env_example +62 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +234 -0
- data/bin/console +8 -0
- data/bin/loc_downloads.sh +62 -0
- data/bin/loc_import_4store.sh +24 -0
- data/bin/loc_import_allegrograph.sh +22 -0
- data/bin/loc_import_marklogic.sh +19 -0
- data/bin/readMarcAuthority +113 -0
- data/lib/marc2linkeddata/configuration.rb +146 -0
- data/lib/marc2linkeddata/isni.rb +23 -0
- data/lib/marc2linkeddata/lib_auth.rb +17 -0
- data/lib/marc2linkeddata/loc.rb +91 -0
- data/lib/marc2linkeddata/oclc_creative_work.rb +44 -0
- data/lib/marc2linkeddata/oclc_identity.rb +46 -0
- data/lib/marc2linkeddata/oclc_resource.rb +79 -0
- data/lib/marc2linkeddata/oclc_work.rb +19 -0
- data/lib/marc2linkeddata/parseMarcAuthority.rb +492 -0
- data/lib/marc2linkeddata/readMarcCatalog.rb +175 -0
- data/lib/marc2linkeddata/resource.rb +131 -0
- data/lib/marc2linkeddata/sparql.rb +55 -0
- data/lib/marc2linkeddata/viaf.rb +48 -0
- data/lib/marc2linkeddata.rb +64 -0
- data/marc2linkeddata.gemspec +40 -0
- data/spec/marc2linkeddata/configuration_spec.rb +84 -0
- data/spec/marc2linkeddata/loc_spec.rb +71 -0
- data/spec/marc2linkeddata/resource_spec.rb +53 -0
- data/spec/marc2linkeddata/viaf_spec.rb +53 -0
- data/spec/marc2linkeddata_spec.rb +39 -0
- data/spec/spec_helper.rb +92 -0
- metadata +243 -0
@@ -0,0 +1,492 @@
|
|
1
|
+
|
2
|
+
# Marc21 Authority fields are documented at
|
3
|
+
# http://www.loc.gov/marc/authority/ecadlist.html
|
4
|
+
# http://www.loc.gov/marc/authority/ecadhome.html
|
5
|
+
|
6
|
+
module Marc2LinkedData
|
7
|
+
|
8
|
+
class ParseMarcAuthority
|
9
|
+
|
10
|
+
# TODO: provide iterator pattern on an entire file of records.
|
11
|
+
# @leader = ParseMarcAuthority::parse_leader(marc_file)
|
12
|
+
# raw = marc_file.read(@leader[:length])
|
13
|
+
# @record = MARC::Reader.decode(raw)
|
14
|
+
|
15
|
+
@@config = nil
|
16
|
+
|
17
|
+
attr_reader :loc
|
18
|
+
attr_reader :isni
|
19
|
+
attr_reader :viaf
|
20
|
+
|
21
|
+
# Wrap one MARC authority record so it can be converted to RDF.
#
# record - the MARC record whose fields the other methods read.
def initialize(record)
  # The shared configuration is fetched once and then reused.
  @@config ||= Marc2LinkedData.configuration
  @record = record
  # The authority wrappers start out unset; #graph assigns them later.
  @loc = @isni = @viaf = nil
  @graph = RDF::Graph.new
end
|
29
|
+
|
30
|
+
# Return every field of the record whose tag equals field_num.
#
# field_num - the tag to look for, e.g. '001'.
#
# Raises RuntimeError when the record has no field with that tag.
def get_fields(field_num)
  matches = @record.fields.select { |field| field.tag == field_num }
  return matches unless matches.empty?

  raise "Invalid data in field #{field_num}"
end
|
35
|
+
|
36
|
+
# Try to use the SUL catkey and/or the OCLC control numbers, maybe SUL
|
37
|
+
# catkey in the record IRI
|
38
|
+
# Return the record ID: the value of the first 001 control field, see
# http://www.loc.gov/marc/authority/ad001.html
#
# A "#{field003}-#{field001}" form that also used the 003 field was left
# commented out in the original and is not produced here.
#
# Raises whatever get_fields raises when there is no 001 field.
def get_id
  control_number_field, = get_fields('001')
  control_number_field.value
end
|
46
|
+
|
47
|
+
# Return the value of the first subfield of `field` that contains
# iri_pattern, or nil when no subfield matches.
#
# field       - a MARC data field responding to #subfields.
# iri_pattern - substring identifying the IRI, e.g. 'viaf.org'.
#
# The previous collect/first form only looked at the first subfield and
# returned nil if that one did not match, even when a later one did.
# Any error (e.g. a nil field) still results in nil, as before.
def get_iri(field, iri_pattern)
  match = field.subfields.find { |subfield| subfield.value.to_s.include?(iri_pattern) }
  match && match.value
rescue StandardError
  nil
end
|
55
|
+
|
56
|
+
# Resolve the ISNI IRI for this record, e.g.
# http://www.isni.org/isni/0000000109311081
#
# The IRI is taken from the configured ISNI field of the MARC record; when
# it is not there and @@config.get_isni is set, @viaf is asked for it.
#
# Returns the IRI, or nil when it cannot be resolved.
def get_iri4isni
  isni_iri = nil
  begin
    field = get_fields(@@config.field_auth_isni).first
    isni_iri = get_iri(field, 'isni.org')
  rescue StandardError
    # get_fields raises when the field is absent; that must not prevent
    # the VIAF fallback below (get_iri4loc handles its field the same way).
  end
  # If ISNI is not already in the MARC record, try to get it from VIAF.
  if isni_iri.nil? && @@config.get_isni
    isni_iri = @viaf.get_isni rescue nil
    @@config.logger.debug 'Failed to resolve ISNI URI' if isni_iri.nil?
  end
  return nil if isni_iri.nil?

  # Ensure the ISNI IRI has this prefix: http://www.isni.org/isni/
  # The original called gsub here and discarded the result, so the prefix
  # was never applied. to_s is used because the type returned by
  # @viaf.get_isni is not visible here (assumed String or URI-like).
  unless isni_iri.to_s =~ %r{www\.isni\.org/isni/}
    isni_iri = isni_iri.to_s.gsub('www.isni.org', 'www.isni.org/isni')
  end
  isni_iri
rescue StandardError
  nil
end
|
77
|
+
|
78
|
+
# Build the local library authority IRI: the configured 'lib_auth' prefix
# followed by the record ID.
def get_iri4lib
  prefix = @@config.prefixes['lib_auth']
  record_id = get_id
  "#{prefix}#{record_id}"
end
|
81
|
+
|
82
|
+
# Resolve the Library of Congress IRI for this record, e.g.
# http://id.loc.gov/authorities/names/n42000906
#
# The IRI is read from the configured LOC field when present. Otherwise it
# is derived from the record ID and checked with an HTTP HEAD request.
#
# Returns the IRI, or nil when it cannot be resolved or an error occurs.
def get_iri4loc
  loc_iri = begin
    get_iri(get_fields(@@config.field_auth_loc).first, 'id.loc.gov')
  rescue StandardError
    nil
  end
  begin
    unless loc_iri.nil?
      @@config.logger.debug "MARC contains LOC IRI: #{loc_iri}"
      return loc_iri
    end
    # The LOC IRI is not in the MARC record, so try to determine it from
    # the ID: 'n...' IDs are names, 'sh...' IDs are subjects.
    loc_id = get_id
    loc_iri = "#{@@config.prefixes['loc_names']}#{loc_id.downcase}" if loc_id =~ /^n/i
    loc_iri = "#{@@config.prefixes['loc_subjects']}#{loc_id.downcase}" if loc_id =~ /^sh/i
    unless loc_iri.nil?
      # Verify the URL (HEAD is used so it's as fast as possible).
      @@config.logger.debug "Trying to validate LOC IRI: #{loc_iri}"
      response = Marc2LinkedData.http_head_request(loc_iri + '.rdf')
      status = response.code
      if '301' == status
        # Moved permanently: use the redirection.
        loc_iri = response['location']
      elsif !['200', '302', '303'].include?(status)
        # 200 is good to go; 302 (Moved Temporarily) and 303 (See Other)
        # keep the current URL because most GET requests follow them.
        # Anything else means the IRI could not be validated.
        loc_iri = nil
      end
    end
    if loc_iri.nil?
      # If it gets here, it's a problem.
      binding.pry if @@config.debug
      @@config.logger.error 'FAILURE to resolve LOC IRI'
    else
      @@config.logger.debug "DISCOVERED LOC IRI: #{loc_iri}"
    end
    loc_iri
  rescue StandardError
    nil
  end
end
|
133
|
+
|
134
|
+
# Build a WorldCat IRI from the OCLC control number in subfield 'a' of the
# first configured OCLC field.
#
# Returns "http://www.worldcat.org/oclc/<digits>", or nil when the field or
# subfield is missing, the value has no trailing digits, or an error occurs.
def get_iri4oclc
  field = get_fields(@@config.field_auth_oclc).first
  # find looks at every subfield; the previous collect/first form returned
  # nil whenever subfield 'a' was not the first subfield.
  subfield_a = field.subfields.find { |subfield| subfield.code == 'a' }
  oclc_cn = subfield_a && subfield_a.value
  oclc_id = /\d+$/.match(oclc_cn).to_s
  oclc_id.empty? ? nil : "http://www.worldcat.org/oclc/#{oclc_id}"
rescue StandardError
  nil
end
|
144
|
+
|
145
|
+
# Resolve the VIAF IRI for this record, e.g.
# http://viaf.org/viaf/181829329
# (VIAF RSS feed for changes, e.g. http://viaf.org/viaf/181829329.rss)
#
# The IRI is taken from the configured VIAF field of the MARC record; when
# it is not there and @@config.get_viaf is set, @loc is asked for it.
#
# Returns the IRI, or nil when it cannot be resolved.
def get_iri4viaf
  viaf_iri = nil
  begin
    field = get_fields(@@config.field_auth_viaf).first
    viaf_iri = get_iri(field, 'viaf.org')
  rescue StandardError
    # get_fields raises when the field is absent; that must not prevent
    # the LOC fallback below (get_iri4loc handles its field the same way).
  end
  # If VIAF is not already in the MARC record, try to get it from LOC.
  if viaf_iri.nil? && @@config.get_viaf
    viaf_iri = @loc.get_viaf rescue nil
    @@config.logger.debug 'Failed to resolve VIAF URI' if viaf_iri.nil?
  end
  viaf_iri
rescue StandardError
  nil
end
|
161
|
+
|
162
|
+
# Peek at the MARC leader at the current position of file_handle and return
# its main elements, leaving the handle where it was.
#
# file_handle  - an IO-like object supporting #read and #seek.
# leader_bytes - number of bytes to read (24 for a MARC 21 leader).
#
# Example leader: "00774cz  a2200253n  4500"
#   00-04: '00774' - record length
#      05: 'c'     - record status (a, c, d, n, o, s or x)
#      06: 'z'     - always 'z' for authority records
#      09: 'a'     - UCS/Unicode
#   12-16: '00253' - base address of data (length of leader and directory)
#      17: 'n'     - complete authority record
#
# Returns a Hash with :length, :status, :type, :encoding, :data_address and
# :complete.
# Raises EOFError when fewer than 18 bytes are available (position 17 is
# the last one read); previously this failed with a NoMethodError on nil.
def self.parse_leader(file_handle, leader_bytes=24)
  leader = file_handle.read(leader_bytes)
  # Rewind by the number of bytes actually read. A short read near the end
  # of the input used to seek back a full leader_bytes, moving the handle
  # before its starting position (or failing outright).
  bytes_read = leader.nil? ? 0 : leader.bytesize
  file_handle.seek(-bytes_read, IO::SEEK_CUR) if bytes_read > 0
  if leader.nil? || leader.length < 18
    raise EOFError, "Incomplete MARC leader: read #{bytes_read} of #{leader_bytes} bytes"
  end
  {
    :length => leader[0..4].to_i,
    :status => leader[5],
    :type => leader[6],     # always 'z' for authority records
    :encoding => leader[9], # letter code, not yet a ruby encoding name
    :data_address => leader[12..16].to_i,
    :complete => leader[17].include?('n')
  }
end
|
192
|
+
|
193
|
+
# Decode the fixed-length 008 control field of the authority record, see
# http://www.loc.gov/marc/authority/concise/ad008.html
#
# Returns a Hash keyed by symbols; each value is the raw character at the
# given position unless noted below.
# Raises whatever get_fields raises when the record has no 008 field.
def parse_008
  field008 = get_fields('008').first.value

  # 08 - language of catalog: 'b' is both, 'e' English only, 'f' French only.
  language_code = field008[8]
  languages = []
  languages.append('English') if %w[b e].include?(language_code)
  languages.append('French') if %w[b f].include?(language_code)

  # 10 - descriptive cataloging rules; unknown codes map to ''.
  rules = {
    'a' => 'EARLIER',
    'b' => 'AACR1',
    'c' => 'AACR2',
    'd' => 'AACR2 compatible',
    'z' => 'OTHER',
    'n' => 'N/A'
  }.fetch(field008[10], '')

  {
    :date => Date.strptime(field008[0..5], "%y%m%d"),
    :geographic_subdivision => field008[6], # '#', d, i, n, or '|'
    :romanization_scheme => field008[7],    # a..g, n, or '|'
    :languages => languages,
    :kind => field008[9],                   # a..g, or '|'
    :rules => rules,
    :heading_system => field008[11],
    :series_type => field008[12],
    :series_numbered => field008[13],
    :use_1XX_for_7XX => field008[14] == 'a',
    :use_1XX_for_6XX => field008[15] == 'a',
    # Position 16 drives both the 4XX and the 8XX flag.
    :use_1XX_for_4XX => field008[16] == 'a',
    :use_1XX_for_8XX => field008[16] == 'a',
    :type_subject_subdivision => field008[17],
    # 18-27 - undefined character positions
    :type_government_agency => field008[28],
    :reference_evaluation => field008[29],
    # 30 - undefined character position
    :record_available => field008[31] == 'a',
    # TODO: 32 - undifferentiated personal name:
    #   a - differentiated (the name in field 100 is unique)
    #   b - undifferentiated (used by two or more persons)
    #   n - not applicable (not a personal name, or a family name)
    #   | - no attempt to code
    # TODO: 33
    # 34-37 - undefined character positions
    # TODO: 38
    # TODO: 39
  }
end
|
243
|
+
|
244
|
+
# Extract the personal name (field 100, subfield 'a'), see
# http://www.loc.gov/marc/authority/concise/ad100.html
#
# Returns the name tagged as UTF-8, '' when the field has no usable 'a'
# subfield, or 'ERROR_PERSONAL_NAME' when anything else fails (including a
# record without a 100 field, for which get_fields raises).
def parse_100
  personal_field = get_fields('100').first
  name = begin
    personal_field.subfields.find { |subfield| subfield.code == 'a' }.value
  rescue StandardError
    ''
  end
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PERSONAL_NAME'
end
|
256
|
+
|
257
|
+
# Extract the corporate name (field 110), see
# http://www.loc.gov/marc/authority/concise/ad110.html
#
# The values of all 'a' subfields, then all 'b', then all 'c' are joined
# with ' : '. Returns the joined name tagged as UTF-8 ('' when the record
# has no 110 field), or 'ERROR_CORPORATE_NAME' on any other failure.
def parse_110
  corporate_field = @record.fields.find { |field| field.tag == '110' }
  parts = %w[a b c].flat_map do |code|
    begin
      corporate_field.subfields.select { |subfield| subfield.code == code }.map(&:value).compact
    rescue StandardError
      # No 110 field (or unreadable subfields): contribute nothing.
      []
    end
  end
  parts.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_CORPORATE_NAME'
end
|
271
|
+
|
272
|
+
# Extract the meeting name (field 111, subfield 'a'), see
# http://www.loc.gov/marc/authority/concise/ad111.html
#
# Returns the 'a' subfield values joined with ' : ' and tagged as UTF-8
# ('' when the record has no 111 field), or 'ERROR_MEETING_NAME' on any
# other failure.
#
# The original called force_encoding on the Array of values, which always
# raised, so the method always returned 'ERROR_MEETING_NAME'.
def parse_111
  meeting_field = @record.fields.find { |field| field.tag == '111' }
  names = begin
    meeting_field.subfields.select { |subfield| subfield.code == 'a' }.map(&:value).compact
  rescue StandardError
    # No 111 field (or unreadable subfields): no name parts.
    []
  end
  # TODO: incorporate additional subfields ('b', 'c'), as parse_110 does?
  names.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_MEETING_NAME'
end
|
288
|
+
|
289
|
+
# Extract the geographic name (field 151, subfield 'a'), see
# http://www.loc.gov/marc/authority/concise/ad151.html
#
# Returns the name tagged as UTF-8, '' when the record has no 151 field,
# or 'ERROR_PLACE_NAME' when the field has no 'a' subfield or anything
# else fails.
#
# find looks at every subfield; the previous collect/first form failed
# whenever subfield 'a' was not the first subfield.
def parse_151
  place_field = @record.fields.find { |field| field.tag == '151' }
  name = begin
    subfield_a = place_field.subfields.find { |subfield| subfield.code == 'a' }
    subfield_a && subfield_a.value
  rescue StandardError
    # No 151 field: report an empty name, as before.
    ''
  end
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PLACE_NAME'
end
|
300
|
+
|
301
|
+
# Create type and name triples for a LOC name authority.
#
# At present this relies on the LOC RDF (@loc) to differentiate the kinds
# of authority. It should be possible to do this from the MARC directly,
# if @@config.get_loc is false; the MARC data differentiates them by tag:
#   X00 - Personal Name
#   X10 - Corporate Name
#   X11 - Meeting Name
#   X30 - Uniform Title
#   X51 - Jurisdiction / Geographic Name
#
# Triples are inserted into @graph with @lib.rdf_uri as subject. Returns ''
# for name/title authorities, which are skipped.
def parse_loc_auth_name
  @@config.logger.warn "LOC URL: #{@loc.iri} DEPRECATED" if @loc.deprecated?
  name = ''
  if @loc.conference?
    # e.g. http://id.loc.gov/authorities/names/n79044866
    name = @loc.label || parse_111
    # schema:Event is the class. The lower-case schema:event used before is
    # a property and is not a valid object for rdf:type.
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Event)
  elsif @loc.corporation?
    name = @loc.label || parse_110
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Organization) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Organization) if @@config.use_schema
  elsif @loc.name_title?
    # e.g. http://id.loc.gov/authorities/names/n79044934
    # Skipping these, because the person entity should be in
    # an additional record and we don't want the title content.
    binding.pry if @@config.debug
    return ''
  elsif @loc.person?
    name = @loc.label || parse_100
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Person) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Person) if @@config.use_schema
    # VIAF extracts first and last name, try to use them. Note
    # that VIAF uses schema:name, schema:givenName, and schema:familyName.
    if @@config.get_viaf && ! @viaf.nil?
      @viaf.family_names.each do |n|
        # TODO: try to get a language type, if VIAF provides it, e.g.
        # RDF::Literal.new(n, :language => :en)
        ln = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.familyName, ln) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.familyName, ln) if @@config.use_schema
      end
      @viaf.given_names.each do |n|
        # TODO: try to get a language type, if VIAF provides it, e.g.
        # RDF::Literal.new(n, :language => :en)
        fn = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.firstName, fn) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.givenName, fn) if @@config.use_schema
      end
    end
  elsif @loc.place?
    # e.g. http://id.loc.gov/authorities/names/n79045127
    name = @loc.label || parse_151
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Place)
  else
    # TODO: find out what type this is.
    binding.pry if @@config.debug
    name = @loc.label || ''
    # Note: schema.org has no immediate parent for Person or Organization
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Agent) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Thing) if @@config.use_schema
  end
  if name != ''
    name = RDF::Literal.new(name)
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.name, name) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.name, name) if @@config.use_schema
  end
end
|
375
|
+
|
376
|
+
# Placeholder for LOC subject authorities: no triples are created yet.
# In debug mode it opens a pry session so the record can be inspected.
#
# TODO: what to do with subjects?
#
# The term 'subject' refers to:
#   X30 - Uniform Titles
#   X48 - Chronological Terms
#   X50 - Topical Terms
#   X51 - Geographic Names
#   X55 - Genre/Form Terms
#
# The term 'subject subdivision' refers to:
#   X80 - general subdivision terms
#   X81 - geographic subdivision names
#   X82 - chronological subdivision terms
#   X85 - form subdivision terms
def parse_loc_auth_subject
  if @@config.debug
    binding.pry
  end
end
|
392
|
+
|
393
|
+
# Link the LOC authority to its OCLC identity and, optionally, to the
# creative works and generic work entities that OCLC associates with it.
# All triples are inserted into @graph.
def get_oclc_links
  # Try to get OCLC using the LOC ID. Only when that lookup raises, fall
  # back to the 035a field data, which is not as reliable/accurate as LOC.
  oclc_iri = begin
    @loc.get_oclc_identity
  rescue StandardError
    get_iri4oclc
  end
  return if oclc_iri.nil?

  # Try to get additional data from OCLC, using the RDFa
  # available in the OCLC identities pages.
  oclc_auth = OclcIdentity.new oclc_iri
  @graph.insert RDF::Statement(@loc.rdf_uri, RDF::OWL.sameAs, oclc_auth.rdf_uri)
  oclc_auth.creative_works.each do |creative_work_uri|
    # Note on an OCLC data inconsistency: the RDFa for
    # http://www.worldcat.org/identities/lccn-n79044798 contains
    #   <http://worldcat.org/oclc/747413718> a <http://schema.org/CreativeWork> .
    # while the RDF for <http://worldcat.org/oclc/747413718> contains
    #   <http://www.worldcat.org/oclc/747413718> schema:exampleOfWork <http://worldcat.org/entity/work/id/994448191> .
    # i.e. the subject there is 'www.worldcat.org' instead of 'worldcat.org'.
    creative_work = OclcCreativeWork.new creative_work_uri
    @graph.insert RDF::Statement(oclc_auth.rdf_uri, RDF::RDFS.seeAlso, creative_work.rdf_uri)
    next unless @@config.oclc_auth2works

    # Try to use VIAF to relate auth to work as creator, contributor or
    # editor. Note that this requires additional RDF retrieval for each
    # work (slower processing).
    unless @viaf.nil?
      relation =
        if creative_work.creator? @viaf.iri
          RDF::SCHEMA.creator
        elsif creative_work.contributor? @viaf.iri
          RDF::SCHEMA.contributor
        elsif creative_work.editor? @viaf.iri
          RDF::SCHEMA.editor
        end
      @graph.insert RDF::Statement(creative_work.rdf_uri, relation, oclc_auth.rdf_uri) if relation
    end
    # TODO: Is auth the subject of the work (as in biography) or both (as in autobiography).
    # Try to find the generic work entity for this example work.
    creative_work.get_works.each do |oclc_work_uri|
      oclc_work = OclcWork.new oclc_work_uri
      @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.exampleOfWork, oclc_work.rdf_uri)
    end
  end
end
|
443
|
+
|
444
|
+
# TODO: use an 'affiliation' entry, maybe 373? (optional field)
|
445
|
+
|
446
|
+
# Serialize the record's graph (built on demand by #graph) through the
# graph's own to_ttl.
def to_ttl
  rdf_graph = graph
  rdf_graph.to_ttl
end
|
449
|
+
|
450
|
+
# Build (once) and return the RDF graph for this authority record.
#
# A non-empty @graph is returned as is. Otherwise the LOC, VIAF and ISNI
# IRIs are resolved, owl:sameAs links from the library IRI are added, and,
# when @@config.get_loc is set, the LOC authority is elaborated with type
# and name triples and (optionally) OCLC links.
#
# When no LOC authority can be resolved, the graph built so far is returned
# without the LOC-dependent triples. Previously @loc was dereferenced
# without a nil check, unlike @viaf and @isni.
def graph
  # TODO: figure out how to specify all the graph prefixes.
  return @graph unless @graph.empty?
  @lib = LibAuth.new get_iri4lib
  # Try to find LOC, VIAF, and ISNI IRIs in the MARC record
  @loc = Loc.new get_iri4loc rescue nil
  # Try to identify problems in getting an LOC IRI.
  binding.pry if (@@config.debug && @loc.nil?)
  # might require LOC to get VIAF.
  @viaf = Viaf.new get_iri4viaf rescue nil
  # might require VIAF to get ISNI.
  @isni = Isni.new get_iri4isni rescue nil

  # TODO: ORCID? VIVO? VITRO? Stanford CAP?

  # Get LOC control number and add catalog permalink? e.g.
  # http://lccn.loc.gov/n79046291

  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @loc.rdf_uri) unless @loc.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @viaf.rdf_uri) unless @viaf.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @isni.rdf_uri) unless @isni.nil?
  return @graph unless @@config.get_loc
  if @loc.nil?
    # Everything below needs the LOC authority.
    @@config.logger.error 'No LOC authority; skipping LOC elaboration'
    return @graph
  end

  # TODO: find codes in the marc record to differentiate the authority into
  # TODO: person, organization, event, etc. without getting LOC RDF.

  if @loc.iri.to_s =~ /name/
    parse_loc_auth_name
  elsif @loc.iri.to_s =~ /subjects/
    parse_loc_auth_subject
  else
    binding.pry if @@config.debug
  end
  # Optional elaboration of authority data with OCLC identity and works.
  get_oclc_links if @@config.get_oclc

  @@config.logger.info "Extracted #{@loc.id}"
  @graph
end
|
489
|
+
end
|
490
|
+
|
491
|
+
end
|
492
|
+
|
@@ -0,0 +1,175 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'marc'
|
4
|
+
require 'linkeddata'
|
5
|
+
require 'pry'
|
6
|
+
|
7
|
+
#EXAMPLE_RECORD_FILE='../marc/catalog/stf.00.mrc'
|
8
|
+
EXAMPLE_RECORD_FILE='../marc/catalog/stf.51.mrc'
|
9
|
+
|
10
|
+
## reading records from a batch file
|
11
|
+
#reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "MARC-8")
|
12
|
+
#reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "UTF-8", :validate_encoding => true)
|
13
|
+
|
14
|
+
#reader = MARC::ForgivingReader.new(EXAMPLE_RECORD_FILE)
|
15
|
+
|
16
|
+
# Read only the first record of the example file: the first 5 bytes hold
# the record length (e.g. 774), which is then used to read the raw record.
# The block form of File.open closes the handle; the previous File.new
# handle was never closed.
rec_length = nil
raw = File.open(EXAMPLE_RECORD_FILE) do |handle|
  rec_length = handle.read(5).to_i
  handle.rewind
  handle.read(rec_length)
end
record = MARC::Reader.decode(raw)
|
23
|
+
|
24
|
+
# From http://www.loc.gov/marc/authority/adleader.html
|
25
|
+
# System-Generated Elements - The following Leader elements are usually system generated:
|
26
|
+
#
|
27
|
+
# 00-04 Logical record length
|
28
|
+
#
|
29
|
+
# 05 - Record status:
|
30
|
+
# a - Increase in encoding level
|
31
|
+
# c - Corrected or revised
|
32
|
+
# d - Deleted
|
33
|
+
# n - New
|
34
|
+
# o - Obsolete
|
35
|
+
# s - Deleted; heading split into two or more headings
|
36
|
+
# x - Deleted; heading replaced by another heading
|
37
|
+
#
|
38
|
+
# 06 - Type of record
|
39
|
+
# z - Authority data
|
40
|
+
#
|
41
|
+
# 07-08 Undefined character positions
|
42
|
+
#
|
43
|
+
# 09 - Character coding scheme
|
44
|
+
# # - MARC-8
|
45
|
+
# a - UCS/Unicode
|
46
|
+
#
|
47
|
+
# 10 Indicator count
|
48
|
+
# 2 - Number of character positions used for indicators
|
49
|
+
#
|
50
|
+
# 11 Subfield code count
|
51
|
+
# 2 - Number of character positions used for a subfield code
|
52
|
+
#
|
53
|
+
# 12-16 Base address of data
|
54
|
+
# [number] - Length of Leader and Directory
|
55
|
+
#
|
56
|
+
# 17 - Encoding level
|
57
|
+
# n - Complete authority record
|
58
|
+
# o - Incomplete authority record
|
59
|
+
#
|
60
|
+
# 20-23 Entry map
|
61
|
+
#
|
62
|
+
# 18-19 - Undefined character positions
|
63
|
+
#
|
64
|
+
# 20 - Length of the length-of-field portion
|
65
|
+
# 4 - Number of characters in the length-of-field portion of a Directory entry
|
66
|
+
#
|
67
|
+
# 21 - Length of the starting-character-position portion
|
68
|
+
# 5 - Number of characters in the starting-character-position portion of a Directory entry
|
69
|
+
#
|
70
|
+
# 22 - Length of the implementation-defined portion
|
71
|
+
# 0 - Number of characters in the implementation-defined portion of a Directory entry
|
72
|
+
#
|
73
|
+
# It is common for default values in other Leader elements to be generated automatically as well.
|
74
|
+
# Capitalization - Alphabetic codes are input as lower case letters.
|
75
|
+
#
|
76
|
+
# example:
|
77
|
+
#record.leader
|
78
|
+
#=> "00774cz a2200253n 4500"
|
79
|
+
# 00-04: '00774' - record length
|
80
|
+
# 05: 'c' - corrected or revised
|
81
|
+
# 06: 'z' - always 'z' for authority records
|
82
|
+
# 09: 'a' - UCS/Unicode
|
83
|
+
# 12-16: '00253' - base address of data, Length of Leader and Directory
|
84
|
+
# 17: 'n' - Complete authority record
|
85
|
+
# Meaning of the record status code at leader position 05.
# Frozen so the lookup table cannot be modified by accident.
LEADER_STATUS_CODES = {
  'a' => 'Increase in encoding level',
  'c' => 'Corrected or revised',
  'd' => 'Deleted',
  'n' => 'New',
  'o' => 'Obsolete',
  's' => 'Deleted; heading split into two or more headings',
  'x' => 'Deleted; heading replaced by another heading'
}.freeze
|
94
|
+
# Break the leader of a decoded MARC record into its main elements.
#
# record - an object responding to #leader with the leader string.
#
# Returns a Hash with :length, :status, :type, :encoding, :data_address
# and :complete.
def leader_parse(record)
  raw_leader = record.leader
  {
    :length => raw_leader[0..4].to_i,
    :status => raw_leader[5],   # a key of LEADER_STATUS_CODES
    :type => raw_leader[6],     # always 'z' for authority records
    :encoding => raw_leader[9], # TODO: translate letter code into ruby encoding string
    :data_address => raw_leader[12..16].to_i,
    :complete => raw_leader[17].include?('n')
  }
end
|
104
|
+
|
105
|
+
|
106
|
+
# Stanford Resource keys and Ckeys can collide. They are only unique within their own set.
|
107
|
+
#
|
108
|
+
# When I do a catalogdump for ckey 6809804 I see:
|
109
|
+
#
|
110
|
+
# .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
|
111
|
+
#
|
112
|
+
# When we do a catalogdump for searchworks we filter the results to only export
|
113
|
+
# records with holdings, and not those things which are on order or "shadowed"
|
114
|
+
# i.e. hidden from public view, although we still have the bibliographic data in
|
115
|
+
# the database. When I extracted the records for conversion I selected all of
|
116
|
+
# them.
|
117
|
+
#
|
118
|
+
# - Josh
|
119
|
+
|
120
|
+
|
121
|
+
# Create SUL LOD...
|
122
|
+
# Base IRI for the Stanford University Libraries linked data.
SUL_URI = RDF::URI.new('http://linked-data.stanford.edu/library/')

# The catalog key is the stripped value of the first 001 field; it is
# appended to the base IRI to form the catalog IRI.
field001 = record.fields.find { |field| field.tag == '001' }
cat_key = field001.value.strip
CAT_URI = SUL_URI.join("catalog/#{cat_key}")

# TODO: Evaluate whether cat_key is in SearchWorks, e.g.
# "http://searchworks.stanford.edu/catalog/#{cat_key}"
# http://searchworks.stanford.edu/catalog/7106054

# TODO: extract 035a for OCLC master control number.
# TODO: map the OCLC to the OCLC work number.
field035 = record.fields.select { |field| field.tag == '035' }

# Exploration stops here: open a pry session, then exit without running
# the rest of the script.
binding.pry
exit!
|
140
|
+
|
141
|
+
|
142
|
+
#There is nothing in the MARC record itself to indicate that a holding is
|
143
|
+
#'shadowed' (not available for public view), but one idea to handle them is
|
144
|
+
#to supply a list of shadowed ckeys and that list could easily be transformed
|
145
|
+
#
|
146
|
+
#into a list of triples like this:
|
147
|
+
# <http://linked-data.stanford.edu/library/catalog/{cKey}> <rdf:Property> <http://linked-data.stanford.edu/library/catalog/isShadowed>
|
148
|
+
#...or whatever predicate and object you want to use. Then you can load those into the triple store.
|
149
|
+
|
150
|
+
# .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
|
151
|
+
# When we do a catalogdump for searchworks we filter the results to only export
|
152
|
+
# records with holdings, and not those things which are on order or "shadowed"
|
153
|
+
# i.e. hidden from public view, although we still have the bibliographic data in
|
154
|
+
# the database. When I extracted the records for conversion I selected all of
|
155
|
+
# them.
|
156
|
+
# Holdings note from the first 948 field. A record may have no 948 field,
# in which case holdings is nil (previously this raised on nil).
field948 = record.fields.select { |field| field.tag == '948' }
holdings = field948.first && field948.first.value

# TODO: construct RDF model, see http://blog.datagraph.org/2010/03/rdf-for-ruby
# RDF::Literal.new("Hello!", :language => :en)
#
# The catalog IRI is the CAT_URI constant; the lower-case cat_uri used
# before was never defined.
# NOTE(review): oclc4loc is not defined anywhere in the visible script, so
# the :oclc entry raises NameError until a list of OCLC IRIs is supplied
# (presumably derived from field035 - confirm).
lod = {
  :id => CAT_URI,
  :oclc => oclc4loc.collect { |uri| RDF::URI.new(uri) },
}

binding.pry
exit!
|
170
|
+
|
171
|
+
#for record in reader
|
172
|
+
# # print out field 245 subfield a
|
173
|
+
# puts record['245']['a']
|
174
|
+
#end
|
175
|
+
|