marc2linkeddata 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,492 @@
1
+
2
+ # Marc21 Authority fields are documented at
3
+ # http://www.loc.gov/marc/authority/ecadlist.html
4
+ # http://www.loc.gov/marc/authority/ecadhome.html
5
+
6
+ module Marc2LinkedData
7
+
8
+ class ParseMarcAuthority
9
+
10
+ # TODO: provide iterator pattern on an entire file of records.
11
+ # @leader = ParseMarcAuthority::parse_leader(marc_file)
12
+ # raw = marc_file.read(@leader[:length])
13
+ # @record = MARC::Reader.decode(raw)
14
+
15
+ @@config = nil
16
+
17
+ attr_reader :loc
18
+ attr_reader :isni
19
+ attr_reader :viaf
20
+
21
# Wrap one MARC authority record and prepare an empty RDF graph for it.
#
# record - the MARC record; other methods in this class read record.fields.
#
# The shared configuration is looked up from Marc2LinkedData.configuration
# only when @@config has not been set yet. The LOC, ISNI and VIAF
# authority objects start out as nil.
def initialize(record)
  @@config ||= Marc2LinkedData.configuration
  @record = record
  @graph = RDF::Graph.new
  @loc = @isni = @viaf = nil
end
29
+
30
# Collect every field of the record whose tag equals field_num.
#
# field_num - the tag to match, e.g. '001'.
#
# Returns the matching fields.
# Raises a RuntimeError when the record has no field with that tag.
def get_fields(field_num)
  matches = @record.fields.select { |field| field.tag == field_num }
  raise "Invalid data in field #{field_num}" if matches.empty?
  matches
end
35
+
36
+ # Try to use the SUL catkey and/or the OCLC control numbers, maybe SUL
37
+ # catkey in the record IRI
38
# Return the value of the record's first 001 (control number) field, see
# http://www.loc.gov/marc/authority/ad001.html
#
# An alternative that also includes the 003 field was considered:
#   "#{field003}-#{field001}"
#
# Raises (from get_fields) when the record has no 001 field.
def get_id
  control_number = get_fields('001').first
  control_number.value
end
46
+
47
# Find an IRI inside a MARC field.
#
# field       - a MARC data field responding to #subfields.
# iri_pattern - a substring that identifies the IRI, e.g. 'viaf.org'.
#
# Returns the value of the first subfield that contains iri_pattern, or
# nil when no subfield matches or the field cannot be inspected (for
# example when field is nil).
#
# Every subfield is examined; the match does not have to be the first
# subfield of the field.
def get_iri(field, iri_pattern)
  match = field.subfields.find { |subfield| subfield.value.to_s.include?(iri_pattern) }
  match && match.value
rescue StandardError
  nil
end
55
+
56
# Resolve the ISNI IRI for this record.
#
# The IRI is first looked for in the MARC field named by
# @@config.field_auth_isni (e.g. http://www.isni.org/0000000109311081).
# When the record has no such field or IRI and @@config.get_isni is set,
# @viaf.get_isni is tried as a fallback.
#
# Returns the IRI, rewritten so that it carries the
# http://www.isni.org/isni/ prefix, or nil when nothing could be resolved.
def get_iri4isni
  # A record without the ISNI field makes get_fields raise; treat that as
  # "not in the record" so the VIAF fallback below still runs.
  isni_iri = begin
    get_iri(get_fields(@@config.field_auth_isni).first, 'isni.org')
  rescue StandardError
    nil
  end
  # If ISNI is not already in the MARC record, try to get it from VIAF.
  if isni_iri.nil? && @@config.get_isni
    isni_iri = begin
      @viaf.get_isni
    rescue StandardError
      nil
    end
    @@config.logger.debug 'Failed to resolve ISNI URI' if isni_iri.nil?
  end
  return nil if isni_iri.nil?
  # Ensure the ISNI IRI has this prefix: http://www.isni.org/isni/
  # (gsub returns a new string, so the result must be kept).
  unless isni_iri =~ /www\.isni\.org\/isni\//
    isni_iri = isni_iri.gsub('www.isni.org', 'www.isni.org/isni')
  end
  isni_iri
rescue StandardError
  nil
end
77
+
78
# Build the library authority IRI for this record by appending the
# record ID (see get_id) to the configured 'lib_auth' prefix.
def get_iri4lib
  prefix = @@config.prefixes['lib_auth']
  "#{prefix}#{get_id}"
end
81
+
82
# Resolve the Library of Congress IRI for this record.
#
# 1. Look for an 'id.loc.gov' IRI in the MARC field named by
#    @@config.field_auth_loc, e.g.
#    http://id.loc.gov/authorities/names/n42000906
# 2. Otherwise derive a candidate from the record ID: IDs starting with
#    'n' use the 'loc_names' prefix, IDs starting with 'sh' use the
#    'loc_subjects' prefix. The candidate is checked with an HTTP HEAD
#    request against "<candidate>.rdf".
#
# Returns the IRI, or nil when it cannot be resolved or any step raises.
def get_iri4loc
  # A missing field (or any other lookup failure) just means "not in MARC".
  loc_iri = begin
    get_iri(get_fields(@@config.field_auth_loc).first, 'id.loc.gov')
  rescue StandardError
    nil
  end

  unless loc_iri.nil?
    @@config.logger.debug "MARC contains LOC IRI: #{loc_iri}"
    return loc_iri
  end

  # If the LOC is not in the marc record, try to determine the LOC IRI from the ID.
  loc_id = get_id
  loc_iri = "#{@@config.prefixes['loc_names']}#{loc_id.downcase}" if loc_id =~ /^n/i
  loc_iri = "#{@@config.prefixes['loc_subjects']}#{loc_id.downcase}" if loc_id =~ /^sh/i

  if loc_iri
    # Verify the URL (used HEAD so it's as fast as possible)
    @@config.logger.debug "Trying to validate LOC IRI: #{loc_iri}"
    response = Marc2LinkedData.http_head_request(loc_iri + '.rdf')
    status = response.code
    if status == '301'
      # Use the redirection.
      # NOTE(review): the request was for the '.rdf' document, so the
      # location presumably carries that suffix too - confirm.
      loc_iri = response['location']
    elsif !%w[200 302 303].include?(status)
      # 200 is good to go; 302 (Moved Temporarily) and 303 (See Other) keep
      # the current URL, most get requests will follow them. Anything else
      # invalidates the candidate.
      loc_iri = nil
    end
  end

  if loc_iri.nil?
    # If it gets here, it's a problem.
    binding.pry if @@config.debug
    @@config.logger.error 'FAILURE to resolve LOC IRI'
  else
    @@config.logger.debug "DISCOVERED LOC IRI: #{loc_iri}"
  end
  loc_iri
rescue StandardError
  nil
end
133
+
134
# Build a WorldCat IRI from the OCLC control number in the record.
#
# Reads subfield 'a' of the first field named by @@config.field_auth_oclc
# and uses the trailing digits of its value as the OCLC ID. The 'a'
# subfield is located by code, so it does not have to be the first
# subfield of the field.
#
# Returns "http://www.worldcat.org/oclc/<id>", or nil when the field or
# subfield is missing or its value does not end in digits.
def get_iri4oclc
  field = get_fields(@@config.field_auth_oclc).first
  control_number = field.subfields.find { |subfield| subfield.code == 'a' }
  oclc_id = /\d+$/.match(control_number.value).to_s
  oclc_id.empty? ? nil : "http://www.worldcat.org/oclc/#{oclc_id}"
rescue StandardError
  nil
end
144
+
145
# Resolve the VIAF IRI for this record.
#
# The IRI is first looked for in the MARC field named by
# @@config.field_auth_viaf, e.g. http://viaf.org/viaf/181829329
# (VIAF RSS feed for changes, e.g. http://viaf.org/viaf/181829329.rss).
# When the record has no such field or IRI and @@config.get_viaf is set,
# @loc.get_viaf is tried as a fallback.
#
# Returns the IRI or nil.
def get_iri4viaf
  # A record without the VIAF field makes get_fields raise; treat that as
  # "not in the record" so the LOC fallback below still runs.
  viaf_iri = begin
    get_iri(get_fields(@@config.field_auth_viaf).first, 'viaf.org')
  rescue StandardError
    nil
  end
  # If VIAF is not already in the MARC record, try to get from LOC.
  if viaf_iri.nil? && @@config.get_viaf
    viaf_iri = begin
      @loc.get_viaf
    rescue StandardError
      nil
    end
    @@config.logger.debug 'Failed to resolve VIAF URI' if viaf_iri.nil?
  end
  viaf_iri
rescue StandardError
  nil
end
161
+
162
# Peek at the MARC leader at the current position of file_handle.
#
# file_handle  - a readable, seekable IO positioned at the start of a record.
# leader_bytes - how many bytes to read for the leader (default 24).
#
# The handle is moved back by exactly the number of bytes that were read,
# so the caller can go on to read the whole record from the same position.
#
# Returns a Hash, or nil when fewer than 18 bytes are available (end of
# file or truncated leader), since position 17 is the last one used.
#
# example:
#   "00774cz  a2200253n  4500"
#   00-04: '00774' - record length
#   05:    'c' - corrected or revised
#   06:    'z' - always 'z' for authority records
#   09:    'a' - UCS/Unicode
#   12-16: '00253' - base address of data, Length of Leader and Directory
#   17:    'n' - Complete authority record
#
# Record status codes (position 05):
#   'a' => 'Increase in encoding level',
#   'c' => 'Corrected or revised',
#   'd' => 'Deleted',
#   'n' => 'New',
#   'o' => 'Obsolete',
#   's' => 'Deleted; heading split into two or more headings',
#   'x' => 'Deleted; heading replaced by another heading'
def self.parse_leader(file_handle, leader_bytes=24)
  leader = file_handle.read(leader_bytes)
  return nil if leader.nil? # at EOF nothing was consumed, nothing to undo
  # Rewind by what was actually read; a short read near EOF returns fewer
  # bytes than requested.
  file_handle.seek(-leader.bytesize, IO::SEEK_CUR)
  return nil if leader.length < 18
  {
    :length => leader[0..4].to_i,
    :status => leader[5],          # see the status codes above
    :type => leader[6],            # always 'z' for authority records
    :encoding => leader[9],        # translate letter code into ruby encoding string
    :data_address => leader[12..16].to_i,
    :complete => leader[17] == 'n'
  }
end
192
+
193
# Decode the fixed-length 008 field of the record, see
# http://www.loc.gov/marc/authority/concise/ad008.html
#
# Returns a Hash of the decoded character positions. Positions 18-27, 30
# and 34-37 are undefined; positions 32, 33, 38 and 39 are still TODO.
#
# Position 32 - Undifferentiated personal name
#   Whether the personal name in a name or name/title heading contained in
#   field 100 in an established heading record or a reference record is
#   used by one person or by two or more persons.
#   a - Differentiated personal name (the name in field 100 is unique)
#   b - Undifferentiated personal name (used by two or more persons)
#   n - Not applicable (1XX heading is not a personal name or the personal
#       name is a family name)
#   | - No attempt to code
#
# Raises (from get_fields) when the record has no 008 field.
def parse_008
  codes = get_fields('008').first.value

  # Position 8: 'b' selects both languages, 'e' English, 'f' French.
  language_code = codes[8]
  languages = []
  languages << 'English' if %w[b e].include?(language_code)
  languages << 'French' if %w[b f].include?(language_code)

  # Position 10: descriptive cataloging rules; unknown codes give ''.
  rules_by_code = {
    'a' => 'EARLIER',
    'b' => 'AACR1',
    'c' => 'AACR2',
    'd' => 'AACR2 compatible',
    'z' => 'OTHER',
    'n' => 'N/A'
  }
  rules = rules_by_code.fetch(codes[10], '')

  {
    :date => Date.strptime(codes[0..5], "%y%m%d"),
    :geographic_subdivision => codes[6],   # '#', d, i, n, or '|'
    :romanization_scheme => codes[7],      # a..g, n, or '|'
    :languages => languages,
    :kind => codes[9],                     # a..g, or '|'
    :rules => rules,
    :heading_system => codes[11],
    :series_type => codes[12],
    :series_numbered => codes[13],
    :use_1XX_for_7XX => codes[14] == 'a',
    :use_1XX_for_6XX => codes[15] == 'a',
    :use_1XX_for_4XX => codes[16] == 'a',
    :use_1XX_for_8XX => codes[16] == 'a',
    :type_subject_subdivision => codes[17],
    :type_government_agency => codes[28],
    :reference_evaluation => codes[29],
    :record_available => codes[31] == 'a',
  }
end
243
+
244
# Extract the personal name (subfield 'a' of the first 100 field), see
# http://www.loc.gov/marc/authority/concise/ad100.html
#
# Returns the name tagged as UTF-8, '' when the field has no 'a' subfield,
# or 'ERROR_PERSONAL_NAME' when the record has no 100 field or the name
# cannot be processed.
def parse_100
  # 100 is a personal name
  person = get_fields('100').first
  name = begin
    person.subfields.find { |subfield| subfield.code == 'a' }.value
  rescue StandardError
    ''
  end
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PERSONAL_NAME'
end
256
+
257
# Extract the corporate name from the first 110 field, see
# http://www.loc.gov/marc/authority/concise/ad110.html
#
# All 'a' values come first, then all 'b' values, then all 'c' values,
# joined with ' : '.
#
# Returns the name tagged as UTF-8 ('' when there is no 110 field), or
# 'ERROR_CORPORATE_NAME' when the record cannot be read.
def parse_110
  # 110 is a corporate name
  corporate = @record.fields.find { |field| field.tag == '110' }
  parts = %w[a b c].flat_map do |code|
    begin
      corporate.subfields.select { |subfield| subfield.code == code }.map(&:value).compact
    rescue StandardError
      [] # no 110 field, or its subfields cannot be read
    end
  end
  parts.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_CORPORATE_NAME'
end
271
+
272
# Extract the meeting name (subfield 'a' of the first 111 field), see
# http://www.loc.gov/marc/authority/concise/ad111.html
#
# Returns the name tagged as UTF-8, '' when the record has no 111 field or
# the field has no 'a' subfield, or 'ERROR_MEETING_NAME' when the record
# cannot be read.
#
# TODO: incorporate additional subfields (b, c), e.g. joined with ' : '.
def parse_111
  # 111 is a meeting name
  meeting = @record.fields.find { |field| field.tag == '111' }
  subfield_a = meeting && meeting.subfields.find { |subfield| subfield.code == 'a' }
  # Work on the single String value; force_encoding is not defined on the
  # Array of values.
  name = subfield_a ? subfield_a.value.to_s : ''
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_MEETING_NAME'
end
288
+
289
# Extract the geographic name (subfield 'a' of the first 151 field), see
# http://www.loc.gov/marc/authority/concise/ad151.html
#
# The 'a' subfield is located by code, so it does not have to be the
# first subfield of the field.
#
# Returns the name tagged as UTF-8, '' when the record has no 151 field or
# the field has no 'a' subfield, or 'ERROR_PLACE_NAME' when the record
# cannot be read.
def parse_151
  # 151 is a geographic name
  place = @record.fields.find { |field| field.tag == '151' }
  subfield_a = place && place.subfields.find { |subfield| subfield.code == 'a' }
  name = subfield_a ? subfield_a.value.to_s : ''
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PLACE_NAME'
end
300
+
301
# Create triples for various kinds of LOC name authority.
#
# At present, this relies on LOC RDF (the @loc predicates) to
# differentiate types of authorities. It should be possible to do this
# from the MARC directly, if @@config.get_loc is false.
#
# The MARC data differentiates them according to the tag number.
# The term 'name' refers to:
#   X00 - Personal Name
#   X10 - Corporate Name
#   X11 - Meeting Name
#   X30 - Uniform Title
#   X51 - Jurisdiction / Geographic Name
#
# For each kind an rdf:type statement about @lib.rdf_uri is added to
# @graph, followed by foaf:name / schema:name when a name was found. The
# name is @loc.label when present, otherwise the matching parse_1XX value.
# FOAF and schema.org statements are gated by @@config.use_foaf and
# @@config.use_schema, except the Event and Place types, which are always
# added.
#
# Returns '' without adding a type for name/title authorities.
def parse_loc_auth_name
  @@config.logger.warn "LOC URL: #{@loc.iri} DEPRECATED" if @loc.deprecated?
  name = ''
  if @loc.conference?
    # e.g. http://id.loc.gov/authorities/names/n79044866
    name = @loc.label || parse_111
    # rdf:type needs the schema.org class (Event), not the 'event' property.
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Event)
  elsif @loc.corporation?
    name = @loc.label || parse_110
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Organization) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Organization) if @@config.use_schema
  elsif @loc.name_title?
    # e.g. http://id.loc.gov/authorities/names/n79044934
    # Skipping these, because the person entity should be in
    # an additional record and we don't want the title content.
    binding.pry if @@config.debug
    return ''
  elsif @loc.person?
    name = @loc.label || parse_100
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Person) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Person) if @@config.use_schema
    # VIAF extracts first and last name, try to use them. Note
    # that VIAF uses schema:name, schema:givenName, and schema:familyName.
    if @@config.get_viaf && !@viaf.nil?
      @viaf.family_names.each do |n|
        # TODO: try to get a language type, if VIAF provides it, e.g.
        # RDF::Literal.new(n, :language => :en)
        ln = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.familyName, ln) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.familyName, ln) if @@config.use_schema
      end
      @viaf.given_names.each do |n|
        # TODO: try to get a language type, if VIAF provides it, e.g.
        # RDF::Literal.new(n, :language => :en)
        fn = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.firstName, fn) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.givenName, fn) if @@config.use_schema
      end
    end
  elsif @loc.place?
    # e.g. http://id.loc.gov/authorities/names/n79045127
    name = @loc.label || parse_151
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Place)
  else
    # TODO: find out what type this is.
    binding.pry if @@config.debug
    name = @loc.label || ''
    # Note: schema.org has no immediate parent for Person or Organization
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Agent) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Thing) if @@config.use_schema
  end
  if name != ''
    name = RDF::Literal.new(name)
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.name, name) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.name, name) if @@config.use_schema
  end
end
375
+
376
# Placeholder for LOC subject authorities; no triples are created yet.
# It only opens a pry session when @@config.debug is set.
#
# TODO: what to do with subjects?
#
# The term 'subject' refers to:
#   X30 - Uniform Titles
#   X48 - Chronological Terms
#   X50 - Topical Terms
#   X51 - Geographic Names
#   X55 - Genre/Form Terms
#
# The term 'subject subdivision' refers to:
#   X80 - general subdivision terms
#   X81 - geographic subdivision names
#   X82 - chronological subdivision terms
#   X85 - form subdivision terms
def parse_loc_auth_subject
  return unless @@config.debug
  binding.pry
end
392
+
393
# Optional elaboration of the authority with OCLC identity and works.
#
# The OCLC identity IRI is taken from @loc.get_oclc_identity; when that
# raises, the 035a-based get_iri4oclc is used instead (not as
# reliable/accurate as LOC). Nothing is added when no IRI is found.
#
# Statements added to @graph:
#   <loc>      owl:sameAs   <oclc identity>
#   <identity> rdfs:seeAlso <creative work>   (for each creative work)
# and, only when @@config.oclc_auth2works is set (this requires additional
# RDF retrieval for each work, so processing is slower):
#   <creative work> schema:creator|contributor|editor <identity>
#                                        (first role VIAF confirms, if any)
#   <creative work> schema:exampleOfWork <oclc work>
def get_oclc_links
  oclc_iri = begin
    # Try to get OCLC using LOC ID.
    @loc.get_oclc_identity
  rescue StandardError
    # Try to get OCLC using 035a field data.
    get_iri4oclc
  end
  return if oclc_iri.nil?

  # Try to get additional data from OCLC, using the RDFa
  # available in the OCLC identities pages.
  identity = OclcIdentity.new oclc_iri
  @graph.insert RDF::Statement(@loc.rdf_uri, RDF::OWL.sameAs, identity.rdf_uri)

  identity.creative_works.each do |creative_work_uri|
    # Notes on an OCLC data inconsistency:
    # RDFa for http://www.worldcat.org/identities/lccn-n79044798 contains:
    # <http://worldcat.org/oclc/747413718> a <http://schema.org/CreativeWork> .
    # However, the RDF for <http://worldcat.org/oclc/747413718> contains:
    # <http://www.worldcat.org/oclc/747413718> schema:exampleOfWork <http://worldcat.org/entity/work/id/994448191> .
    # Note how the subject here is 'WWW.worldcat.org' instead of 'worldcat.org'.
    creative_work = OclcCreativeWork.new creative_work_uri
    @graph.insert RDF::Statement(identity.rdf_uri, RDF::RDFS.seeAlso, creative_work.rdf_uri)
    next unless @@config.oclc_auth2works

    # Try to use VIAF to relate auth to work as creator, contributor, editor, etc.
    unless @viaf.nil?
      if creative_work.creator? @viaf.iri
        @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.creator, identity.rdf_uri)
      elsif creative_work.contributor? @viaf.iri
        @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.contributor, identity.rdf_uri)
      elsif creative_work.editor? @viaf.iri
        @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.editor, identity.rdf_uri)
      end
    end
    # TODO: Is auth the subject of the work (as in biography) or both (as in autobiography).

    # Try to find the generic work entity for this example work.
    creative_work.get_works.each do |oclc_work_uri|
      generic_work = OclcWork.new oclc_work_uri
      @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.exampleOfWork, generic_work.rdf_uri)
    end
  end
end
443
+
444
+ # TODO: use an 'affiliation' entry, maybe 373? (optional field)
445
+
446
# Serialize the record's graph by calling to_ttl on the result of #graph.
def to_ttl
  rdf_graph = graph
  rdf_graph.to_ttl
end
449
+
450
# Build (once) and return the RDF graph for this record.
#
# A graph that already contains statements is returned as-is. Otherwise:
#   1. the library authority (@lib) is created from get_iri4lib;
#   2. LOC, VIAF and ISNI authorities are resolved, in that order, because
#      VIAF might require LOC and ISNI might require VIAF; any of them may
#      end up nil;
#   3. owl:sameAs statements link @lib to each authority that was resolved;
#   4. when @@config.get_loc is set and a LOC authority exists, the
#      authority is elaborated by parse_loc_auth_name or
#      parse_loc_auth_subject and, if @@config.get_oclc is set, by
#      get_oclc_links.
#
# TODO: figure out how to specify all the graph prefixes.
# TODO: ORCID? VIVO? VITRO? Stanford CAP?
# TODO: Get LOC control number and add catalog permalink? e.g.
#       http://lccn.loc.gov/n79046291
def graph
  return @graph unless @graph.empty?
  @lib = LibAuth.new get_iri4lib

  # Try to find LOC, VIAF, and ISNI IRIs in the MARC record
  @loc = begin
    Loc.new get_iri4loc
  rescue StandardError
    nil
  end
  # Try to identify problems in getting an LOC IRI.
  binding.pry if @@config.debug && @loc.nil?
  # might require LOC to get VIAF.
  @viaf = begin
    Viaf.new get_iri4viaf
  rescue StandardError
    nil
  end
  # might require VIAF to get ISNI.
  @isni = begin
    Isni.new get_iri4isni
  rescue StandardError
    nil
  end

  # Each authority may be nil (its constructor is rescued above), so every
  # link is guarded, including the LOC one.
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @loc.rdf_uri) unless @loc.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @viaf.rdf_uri) unless @viaf.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @isni.rdf_uri) unless @isni.nil?
  return @graph unless @@config.get_loc

  # Everything below reads @loc; without it there is nothing to elaborate.
  if @loc.nil?
    @@config.logger.error "No LOC authority for #{@lib.rdf_uri}; skipping LOC/OCLC elaboration"
    return @graph
  end

  # TODO: find codes in the marc record to differentiate the authority into
  # TODO: person, organization, event, etc. without getting LOC RDF.
  if @loc.iri.to_s =~ /name/
    parse_loc_auth_name
  elsif @loc.iri.to_s =~ /subjects/
    parse_loc_auth_subject
  else
    binding.pry if @@config.debug
  end
  # Optional elaboration of authority data with OCLC identity and works.
  get_oclc_links if @@config.get_oclc

  @@config.logger.info "Extracted #{@loc.id}"
  @graph
end
489
+ end
490
+
491
+ end
492
+
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'marc'
4
+ require 'linkeddata'
5
+ require 'pry'
6
+
7
+ #EXAMPLE_RECORD_FILE='../marc/catalog/stf.00.mrc'
8
EXAMPLE_RECORD_FILE='../marc/catalog/stf.51.mrc'

## reading records from a batch file
#reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "MARC-8")
#reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "UTF-8", :validate_encoding => true)

#reader = MARC::ForgivingReader.new(EXAMPLE_RECORD_FILE)

# Read the first record of the file by hand: the first 5 bytes of a MARC
# record hold its total length (e.g. 774), so read them, rewind, and then
# read that many bytes. The block form closes the file handle when done.
rec_length = nil
raw = File.open(EXAMPLE_RECORD_FILE) do |handle|
  rec_length = handle.read(5).to_i
  handle.rewind
  handle.read(rec_length)
end
record = MARC::Reader.decode(raw)
23
+
24
+ # From http://www.loc.gov/marc/authority/adleader.html
25
+ # System-Generated Elements - The following Leader elements are usually system generated:
26
+ #
27
+ # 00-04 Logical record length
28
+ #
29
+ # 05 - Record status:
30
+ # a - Increase in encoding level
31
+ # c - Corrected or revised
32
+ # d - Deleted
33
+ # n - New
34
+ # o - Obsolete
35
+ # s - Deleted; heading split into two or more headings
36
+ # x - Deleted; heading replaced by another heading
37
+ #
38
+ # 06 - Type of record
39
+ # z - Authority data
40
+ #
41
+ # 07-08 Undefined character positions
42
+ #
43
+ # 09 - Character coding scheme
44
+ # # - MARC-8
45
+ # a - UCS/Unicode
46
+ #
47
+ # 10 Indicator count
48
+ # 2 - Number of character positions used for indicators
49
+ #
50
+ # 11 Subfield code count
51
+ # 2 - Number of character positions used for a subfield code
52
+ #
53
+ # 12-16 Base address of data
54
+ # [number] - Length of Leader and Directory
55
+ #
56
+ # 17 - Encoding level
57
+ # n - Complete authority record
58
+ # o - Incomplete authority record
59
+ #
60
+ # 20-23 Entry map
61
+ #
62
+ # 18-19 - Undefined character positions
63
+ #
64
+ # 20 - Length of the length-of-field portion
65
+ # 4 - Number of characters in the length-of-field portion of a Directory entry
66
+ #
67
+ # 21 - Length of the starting-character-position portion
68
+ # 5 - Number of characters in the starting-character-position portion of a Directory entry
69
+ #
70
+ # 22 - Length of the implementation-defined portion
71
+ # 0 - Number of characters in the implementation-defined portion of a Directory entry
72
+ #
73
+ # It is common for default values in other Leader elements to be generated automatically as well.
74
+ # Capitalization - Alphabetic codes are input as lower case letters.
75
+ #
76
+ # example:
77
+ #record.leader
78
+ #=> "00774cz a2200253n 4500"
79
+ # 00-04: '00774' - record length
80
+ # 05: 'c' - corrected or revised
81
+ # 06: 'z' - always 'z' for authority records
82
+ # 09: 'a' - UCS/Unicode
83
+ # 12-16: '00253' - base address of data, Length of Leader and Directory
84
+ # 17: 'n' - Complete authority record
85
# Meaning of the record status code found at leader position 05.
# Frozen so the shared lookup table cannot be modified by accident.
LEADER_STATUS_CODES = {
  'a' => 'Increase in encoding level',
  'c' => 'Corrected or revised',
  'd' => 'Deleted',
  'n' => 'New',
  'o' => 'Obsolete',
  's' => 'Deleted; heading split into two or more headings',
  'x' => 'Deleted; heading replaced by another heading'
}.freeze
94
# Decode the interesting positions of a record's MARC leader.
#
# record - an object whose #leader returns the leader string.
#
# Returns a Hash with the record length (00-04), status (05), type (06),
# character coding scheme (09), base address of data (12-16) and whether
# position 17 is 'n' (complete authority record).
def leader_parse(record)
  leader = record.leader
  {
    :length => leader[0..4].to_i,
    :status => leader[5],            # LEADER_STATUS_CODES[ record.leader[5] ]
    :type => leader[6],              # always 'z' for authority records
    :encoding => leader[9],          # TODO: translate letter code into ruby encoding string
    :data_address => leader[12..16].to_i,
    :complete => leader[17].include?('n')
  }
end
104
+
105
+
106
+ # Stanford Resource keys and Ckeys can collide. They are only unique within their own set.
107
+ #
108
+ # When I do a catalogdump for ckey 6809804 I see:
109
+ #
110
+ # .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
111
+ #
112
+ # When we do a catalogdump for searchworks we filter the results to only export
113
+ # records with holdings, and not those things which are on order or "shadowed"
114
+ # i.e. hidden from public view, although we still have the bibliographic data in
115
+ # the database. When I extracted the records for conversion I selected all of
116
+ # them.
117
+ #
118
+ # - Josh
119
+
120
+
121
+ # Create SUL LOD...
122
# Base IRI for the Stanford University Libraries linked data.
SUL_URI = RDF::URI.new('http://linked-data.stanford.edu/library/')

# extract catalog key from field 001 (use the first one)
field001 = record.fields.find { |field| field.tag == '001' }
cat_key = field001.value.strip
CAT_URI = SUL_URI.join("catalog/#{cat_key}")

# TODO: Evaluate whether cat_key is in SearchWorks, e.g.
# "http://searchworks.stanford.edu/catalog/#{cat_key}"
# http://searchworks.stanford.edu/catalog/7106054

# TODO: extract 035a for OCLC master control number.
# TODO: map the OCLC to the OCLC work number.
field035 = record.fields.select { |field| field.tag == '035' }


# Drop into an interactive session, then stop the script here.
binding.pry
exit!
140
+
141
+
142
+ #There is nothing in the MARC record itself to indicate that a holding is
143
+ #'shadowed' (not available for public view), but one idea to handle them is
144
+ #to supply a list of shadowed ckeys and that list could easily be transformed
145
+ #
146
+ #into a list of triples like this:
147
+ # <http://linked-data.stanford.edu/library/catalog/{cKey}> <rdf:Property> <http://linked-data.stanford.edu/library/catalog/isShadowed>
148
+ #...or whatever predicate and object you want to use. Then you can load those into the triple store.
149
+
150
+ # .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
151
+ # When we do a catalogdump for searchworks we filter the results to only export
152
+ # records with holdings, and not those things which are on order or "shadowed"
153
+ # i.e. hidden from public view, although we still have the bibliographic data in
154
+ # the database. When I extracted the records for conversion I selected all of
155
+ # them.
156
# Holdings note from the first 948 field (nil when the record has none).
field948 = record.fields.select { |field| field.tag == '948' }
holdings = field948.first && field948.first.value


# TODO: construct RDF model, see http://blog.datagraph.org/2010/03/rdf-for-ruby
# RDF::Literal.new("Hello!", :language => :en)
#
lod = {
  # The catalog IRI is the CAT_URI constant; no local cat_uri exists.
  :id => CAT_URI,
  # FIXME: oclc4loc is not defined anywhere in this script; it needs to be
  # derived (presumably from the 035 fields) before this line can run.
  :oclc => oclc4loc.collect {|uri| RDF::URI.new(uri) },
}

binding.pry
exit!
170
+
171
+ #for record in reader
172
+ # # print out field 245 subfield a
173
+ # puts record['245']['a']
174
+ #end
175
+