marc2linkeddata 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,492 @@
1
+
2
+ # Marc21 Authority fields are documented at
3
+ # http://www.loc.gov/marc/authority/ecadlist.html
4
+ # http://www.loc.gov/marc/authority/ecadhome.html
5
+
6
+ module Marc2LinkedData
7
+
8
+ class ParseMarcAuthority
9
+
10
+ # TODO: provide iterator pattern on an entire file of records.
11
+ # @leader = ParseMarcAuthority::parse_leader(marc_file)
12
+ # raw = marc_file.read(@leader[:length])
13
+ # @record = MARC::Reader.decode(raw)
14
+
15
+ @@config = nil
16
+
17
+ attr_reader :loc
18
+ attr_reader :isni
19
+ attr_reader :viaf
20
+
21
# Wrap one MARC authority record so it can be converted to linked data.
#
# @param record [#fields] the record to convert; its fields are read by
#   tag in the methods below
def initialize(record)
  # The configuration is looked up once and then shared by all instances.
  @@config ||= Marc2LinkedData.configuration
  @record = record
  @graph = RDF::Graph.new
  # The authority objects stay nil until #graph resolves them.
  @loc = @isni = @viaf = nil
end
29
+
30
# Collect every field of the record that carries the given tag.
#
# @param field_num [String] the MARC tag to match, e.g. '001'
# @return [Array] the matching fields (never empty)
# @raise [RuntimeError] when the record has no field with that tag
def get_fields(field_num)
  matches = @record.fields.select { |candidate| candidate.tag == field_num }
  raise "Invalid data in field #{field_num}" if matches.empty?
  matches
end
35
+
36
+ # Try to use the SUL catkey and/or the OCLC control numbers, maybe SUL
37
+ # catkey in the record IRI
38
# Read the record identifier from the first 001 control field, see
# http://www.loc.gov/marc/authority/ad001.html
#
# An earlier idea, not implemented here, was to combine the values of
# fields 003 and 001 into a single "003-001" identifier.
#
# @return the value of the first 001 field
# @raise [RuntimeError] from #get_fields when the record has no 001 field
def get_id
  control_fields = get_fields('001')
  control_fields.first.value
end
46
+
47
# Find the first subfield value of a field that contains the given pattern.
#
# The previous implementation mapped every subfield to its value or nil and
# then took the first element, so a match was missed whenever the first
# subfield did not match. This version searches all subfields.
#
# @param field [#subfields] a field whose subfields respond to #value
# @param iri_pattern [String] substring to look for, e.g. 'viaf.org'
# @return [String, nil] the first matching value, or nil when nothing
#   matches or the field cannot be read
def get_iri(field, iri_pattern)
  matching = field.subfields.find { |subfield| subfield.value.to_s.include?(iri_pattern) }
  matching && matching.value
rescue StandardError
  # Best effort: a nil or malformed field yields nil instead of an error.
  nil
end
55
+
56
# Resolve the ISNI IRI for this authority.
#
# The IRI is taken from the configured MARC field when present
# (e.g. http://www.isni.org/0000000109311081). If it is not in the MARC
# record and @@config.get_isni is set, @viaf is asked for it.
#
# Fixes over the previous version:
# * the result of the prefix substitution was discarded (String#gsub does
#   not modify its receiver), so the IRI was never normalized;
# * a record without the ISNI field raised in #get_fields, which skipped
#   the VIAF fallback entirely.
#
# @return the ISNI IRI containing 'www.isni.org/isni/', or nil when it
#   cannot be resolved
def get_iri4isni
  isni_iri = begin
    get_iri(get_fields(@@config.field_auth_isni).first, 'isni.org')
  rescue StandardError
    # No usable ISNI field in the MARC record; fall through to VIAF.
    nil
  end
  # If ISNI is not already in the MARC record, try to get it from VIAF.
  if isni_iri.nil? && @@config.get_isni
    isni_iri = begin
      @viaf.get_isni
    rescue StandardError
      nil
    end
    @@config.logger.debug 'Failed to resolve ISNI URI' if isni_iri.nil?
  end
  return nil if isni_iri.nil?
  # Ensure the ISNI IRI has this prefix: http://www.isni.org/isni/
  unless isni_iri =~ %r{www\.isni\.org/isni/}
    isni_iri = isni_iri.gsub('www.isni.org', 'www.isni.org/isni')
  end
  isni_iri
rescue StandardError
  nil
end
77
+
78
# Build the IRI of this record in the local library authority namespace.
#
# @return [String] the configured 'lib_auth' prefix followed by the record ID
def get_iri4lib
  lib_prefix = @@config.prefixes['lib_auth']
  "#{lib_prefix}#{get_id}"
end
81
+
82
# Resolve the Library of Congress IRI for this authority,
# e.g. http://id.loc.gov/authorities/names/n42000906
#
# The IRI is read from the configured MARC field when available. Otherwise
# it is derived from the record ID (IDs starting with 'n' map to the
# 'loc_names' prefix, IDs starting with 'sh' to 'loc_subjects') and then
# checked with an HTTP HEAD request for the '.rdf' form of the IRI.
#
# @return the LOC IRI, or nil when it cannot be resolved or any error occurs
def get_iri4loc
  loc_iri = begin
    get_iri(get_fields(@@config.field_auth_loc).first, 'id.loc.gov')
  rescue StandardError
    # The MARC record has no usable LOC field; derive the IRI below.
    nil
  end

  unless loc_iri.nil?
    @@config.logger.debug "MARC contains LOC IRI: #{loc_iri}"
    return loc_iri
  end

  # If the LOC is not in the marc record, try to determine the LOC IRI from the ID.
  loc_id = get_id
  loc_iri = "#{@@config.prefixes['loc_names']}#{loc_id.downcase}" if loc_id =~ /^n/i
  loc_iri = "#{@@config.prefixes['loc_subjects']}#{loc_id.downcase}" if loc_id =~ /^sh/i

  unless loc_iri.nil?
    # Verify the URL (used HEAD so it's as fast as possible)
    @@config.logger.debug "Trying to validate LOC IRI: #{loc_iri}"
    response = Marc2LinkedData.http_head_request(loc_iri + '.rdf')
    status = response.code
    if status == '301'
      # use the redirection
      loc_iri = response['location']
    elsif !%w[200 302 303].include?(status)
      # 200 is good to go; for 302 (Moved Temporarily) and 303 (See Other)
      # the current URL is kept, most get requests will follow them.
      # Anything else invalidates the IRI.
      loc_iri = nil
    end
  end

  if loc_iri.nil?
    # If it gets here, it's a problem.
    binding.pry if @@config.debug
    @@config.logger.error 'FAILURE to resolve LOC IRI'
  else
    @@config.logger.debug "DISCOVERED LOC IRI: #{loc_iri}"
  end
  loc_iri
rescue StandardError
  nil
end
133
+
134
# Build a WorldCat IRI from the OCLC control number in the configured
# MARC field.
#
# The previous version mapped every subfield to its value or nil and took
# the first element, so the number was missed unless subfield 'a' happened
# to be the first subfield. This version looks up subfield 'a' directly.
#
# @return [String, nil] "http://www.worldcat.org/oclc/<digits>" built from
#   the trailing digits of subfield 'a' of the first matching field, or nil
#   when there is no such field, subfield or digits
def get_iri4oclc
  field = get_fields(@@config.field_auth_oclc).first
  subfield_a = field.subfields.find { |subfield| subfield.code == 'a' }
  # Only the trailing run of digits is the control number, e.g. "(OCoLC)123" -> "123".
  oclc_id = subfield_a.value.to_s[/\d+$/]
  oclc_id.nil? ? nil : "http://www.worldcat.org/oclc/#{oclc_id}"
rescue StandardError
  # Covers a missing field (raised by #get_fields) and a missing subfield 'a'.
  nil
end
144
+
145
# Resolve the VIAF IRI for this authority,
# e.g. http://viaf.org/viaf/181829329
# (VIAF RSS feed for changes, e.g. http://viaf.org/viaf/181829329.rss)
#
# The IRI is taken from the configured MARC field when present. If it is
# not in the MARC record and @@config.get_viaf is set, @loc is asked for it.
#
# The previous version returned nil as soon as the MARC field was absent,
# because #get_fields raises in that case; the LOC fallback was therefore
# only reached when the field existed without a VIAF IRI in it.
#
# @return the VIAF IRI, or nil when it cannot be resolved
def get_iri4viaf
  viaf_iri = begin
    get_iri(get_fields(@@config.field_auth_viaf).first, 'viaf.org')
  rescue StandardError
    # No usable VIAF field in the MARC record; fall through to LOC.
    nil
  end
  # If VIAF is not already in the MARC record, try to get from LOC.
  if viaf_iri.nil? && @@config.get_viaf
    viaf_iri = begin
      @loc.get_viaf
    rescue StandardError
      nil
    end
    @@config.logger.debug 'Failed to resolve VIAF URI' if viaf_iri.nil?
  end
  viaf_iri
rescue StandardError
  nil
end
161
+
162
# Peek at the MARC leader at the current position of a file handle and
# decode its main elements. The handle is moved back to where it was.
#
# Example leader: "00774cz  a2200253n  4500"
#   00-04: '00774' - record length
#   05:    'c'     - record status, one of:
#                    a - Increase in encoding level
#                    c - Corrected or revised
#                    d - Deleted
#                    n - New
#                    o - Obsolete
#                    s - Deleted; heading split into two or more headings
#                    x - Deleted; heading replaced by another heading
#   06:    'z'     - always 'z' for authority records
#   09:    'a'     - UCS/Unicode
#   12-16: '00253' - base address of data, Length of Leader and Directory
#   17:    'n'     - Complete authority record
#
# The previous version crashed with NoMethodError at end of file (read
# returns nil) and, after a short read, tried to seek back further than it
# had advanced. Both cases now restore the position and return nil.
#
# @param file_handle [IO, StringIO] a readable, seekable handle
# @param leader_bytes [Integer] number of bytes in the leader
# @return [Hash, nil] :length, :status, :type, :encoding, :data_address and
#   :complete; nil when a full leader is not available
def self.parse_leader(file_handle, leader_bytes=24)
  leader = file_handle.read(leader_bytes)
  return nil if leader.nil?
  # Rewind by what was actually consumed, which may be less than requested.
  file_handle.seek(-leader.bytesize, IO::SEEK_CUR)
  return nil if leader.bytesize < leader_bytes
  {
    :length => leader[0..4].to_i,
    :status => leader[5],
    :type => leader[6], # always 'z' for authority records
    :encoding => leader[9], # letter code, not yet translated into a ruby encoding string
    :data_address => leader[12..16].to_i,
    :complete => leader[17] == 'n'
  }
end
192
+
193
# Decode the fixed-length data elements of the first 008 field, see
# http://www.loc.gov/marc/authority/concise/ad008.html
#
# Positions 18-27, 30 and 34-37 are undefined; positions 32, 33, 38 and 39
# are still TODO. For reference, position 32 (undifferentiated personal
# name) tells whether the personal name in field 100 is used by one person
# or by two or more persons:
#   a - Differentiated personal name (the name is unique)
#   b - Undifferentiated personal name (used by two or more persons)
#   n - Not applicable (not a personal name, or a family name)
#   | - No attempt to code
#
# @return [Hash] the decoded elements, keyed by symbol
# @raise [RuntimeError] from #get_fields when the record has no 008 field
def parse_008
  field008 = get_fields('008').first.value

  # Position 8: 'b' lists both languages, 'e' English only, 'f' French only.
  language_code = field008[8]
  languages = []
  languages << 'English' if language_code == 'b' || language_code == 'e'
  languages << 'French' if language_code == 'b' || language_code == 'f'

  # Position 10: descriptive cataloging rules; unknown codes give ''.
  rules = {
    'a' => 'EARLIER',
    'b' => 'AACR1',
    'c' => 'AACR2',
    'd' => 'AACR2 compatible',
    'z' => 'OTHER',
    'n' => 'N/A'
  }.fetch(field008[10], '')

  {
    date: Date.strptime(field008[0..5], "%y%m%d"),
    geographic_subdivision: field008[6], # '#', d, i, n, or '|'
    romanization_scheme: field008[7], # a..g, n, or '|'
    languages: languages,
    kind: field008[9], # a..g, or '|'
    rules: rules,
    heading_system: field008[11],
    series_type: field008[12],
    series_numbered: field008[13],
    use_1XX_for_7XX: field008[14] == 'a',
    use_1XX_for_6XX: field008[15] == 'a',
    # Position 16 covers both 4XX and 8XX use.
    use_1XX_for_4XX: field008[16] == 'a',
    use_1XX_for_8XX: field008[16] == 'a',
    type_subject_subdivision: field008[17],
    type_government_agency: field008[28],
    reference_evaluation: field008[29],
    record_available: field008[31] == 'a'
  }
end
243
+
244
# Extract the personal name (subfield 'a' of the first 100 field), see
# http://www.loc.gov/marc/authority/concise/ad100.html
#
# @return [String] the name tagged as UTF-8; '' when the field has no
#   subfield 'a'; 'ERROR_PERSONAL_NAME' when the record has no 100 field
#   or anything else goes wrong
def parse_100
  personal = get_fields('100').first
  name = begin
    personal.subfields.find { |subfield| subfield.code == 'a' }.value
  rescue StandardError
    # The field has no subfield 'a'.
    ''
  end
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PERSONAL_NAME'
end
256
+
257
# Build the corporate name from the first 110 field, see
# http://www.loc.gov/marc/authority/concise/ad110.html
#
# All 'a' values come first, then all 'b' values, then all 'c' values,
# joined with ' : '.
#
# @return [String] the joined name tagged as UTF-8; '' when the record has
#   no 110 field; 'ERROR_CORPORATE_NAME' on any other failure
def parse_110
  corporate = @record.fields.find { |field| field.tag == '110' }
  parts = %w[a b c].map do |code|
    begin
      corporate.subfields.select { |subfield| subfield.code == code }.map(&:value).compact
    rescue StandardError
      # No 110 field (corporate is nil) or unreadable subfields.
      []
    end
  end
  parts.flatten.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_CORPORATE_NAME'
end
271
+
272
# Build the meeting name from the first 111 field, see
# http://www.loc.gov/marc/authority/concise/ad111.html
#
# The previous version called force_encoding on the Array of subfield
# values, which always raised, so the method always returned
# 'ERROR_MEETING_NAME'. The values are now joined with ' : ' (the same
# separator #parse_110 uses) before the encoding is set.
#
# TODO: incorporate additional subfields ('b', 'c')?
#
# @return [String] the 'a' values joined with ' : ' and tagged as UTF-8;
#   '' when the record has no 111 field; 'ERROR_MEETING_NAME' on failure
def parse_111
  meeting = @record.fields.find { |field| field.tag == '111' }
  return '' if meeting.nil?
  names = meeting.subfields.select { |subfield| subfield.code == 'a' }.map(&:value).compact
  names.join(' : ').force_encoding('UTF-8')
rescue StandardError
  'ERROR_MEETING_NAME'
end
288
+
289
# Extract the geographic name (subfield 'a' of the first 151 field), see
# http://www.loc.gov/marc/authority/concise/ad151.html
#
# The previous version mapped every subfield to its value or nil and took
# the first element, so it returned 'ERROR_PLACE_NAME' whenever subfield
# 'a' was not the first subfield. Subfield 'a' is now looked up directly.
#
# @return [String] the name tagged as UTF-8; '' when the record has no 151
#   field or the field has no subfield 'a'; 'ERROR_PLACE_NAME' on failure
def parse_151
  place = @record.fields.find { |field| field.tag == '151' }
  subfield_a = place && place.subfields.find { |subfield| subfield.code == 'a' }
  name = (subfield_a && subfield_a.value) || +''
  name.force_encoding('UTF-8')
rescue StandardError
  'ERROR_PLACE_NAME'
end
300
+
301
# Add type and name triples for a LOC name authority to @graph.
#
# At present, this relies on LOC RDF (the predicate methods of @loc) to
# differentiate types of authorities. It should be possible to do this
# from the MARC directly, if @@config.get_loc is false.
#
# The MARC data differentiates them according to the tag number.
# The term 'name' refers to:
#   X00 - Personal Name
#   X10 - Corporate Name
#   X11 - Meeting Name
#   X30 - Uniform Title
#   X51 - Jurisdiction / Geographic Name
#
# Fix: conferences were typed with RDF::SCHEMA.event, which is the
# schema.org *property* 'event'; the rdf:type object must be the class
# schema:Event.
#
# Name/title authorities are skipped (the method returns '' for them)
# because the person entity should be in an additional record and the
# title content is not wanted.
def parse_loc_auth_name
  @@config.logger.warn "LOC URL: #{@loc.iri} DEPRECATED" if @loc.deprecated?
  name = ''
  if @loc.conference?
    # e.g. http://id.loc.gov/authorities/names/n79044866
    name = @loc.label || parse_111
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Event)
  elsif @loc.corporation?
    name = @loc.label || parse_110
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Organization) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Organization) if @@config.use_schema
  elsif @loc.name_title?
    # e.g. http://id.loc.gov/authorities/names/n79044934
    binding.pry if @@config.debug
    return ''
  elsif @loc.person?
    name = @loc.label || parse_100
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Person) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Person) if @@config.use_schema
    # VIAF extracts first and last name, try to use them. Note
    # that VIAF uses schema:name, schema:givenName, and schema:familyName.
    if @@config.get_viaf && ! @viaf.nil?
      # TODO: try to get a language type, if VIAF provide it, e.g.
      #       RDF::Literal.new(n, :language => :en)
      @viaf.family_names.each do |n|
        ln = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.familyName, ln) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.familyName, ln) if @@config.use_schema
      end
      @viaf.given_names.each do |n|
        fn = RDF::Literal.new(n)
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.firstName, fn) if @@config.use_foaf
        @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.givenName, fn) if @@config.use_schema
      end
    end
  elsif @loc.place?
    # e.g. http://id.loc.gov/authorities/names/n79045127
    name = @loc.label || parse_151
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Place)
  else
    # TODO: find out what type this is.
    binding.pry if @@config.debug
    name = @loc.label || ''
    # Note: schema.org has no immediate parent for Person or Organization
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::FOAF.Agent) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF.type, RDF::SCHEMA.Thing) if @@config.use_schema
  end
  if name != ''
    name = RDF::Literal.new(name)
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::FOAF.name, name) if @@config.use_foaf
    @graph.insert RDF::Statement(@lib.rdf_uri, RDF::SCHEMA.name, name) if @@config.use_schema
  end
end
375
+
376
# Placeholder for LOC subject authorities; no triples are created yet.
# When @@config.debug is set, a pry session is opened instead.
#
# TODO: what to do with subjects?
#
# The term 'subject' refers to:
#   X30 - Uniform Titles
#   X48 - Chronological Terms
#   X50 - Topical Terms
#   X51 - Geographic Names
#   X55 - Genre/Form Terms
#
# The term 'subject subdivision' refers to:
#   X80 - general subdivision terms
#   X81 - geographic subdivision names
#   X82 - chronological subdivision terms
#   X85 - form subdivision terms
def parse_loc_auth_subject
  if @@config.debug
    binding.pry
  end
end
392
+
393
# Link the LOC authority to its OCLC identity and to the creative works
# listed for that identity, adding the triples to @graph.
#
# The OCLC IRI is requested from @loc; if that raises, the 035a field data
# is used instead (#get_iri4oclc), which is not as reliable/accurate as LOC.
# Nothing is added when no OCLC IRI can be found.
#
# When @@config.oclc_auth2works is set, each work is also related to the
# identity as creator, contributor or editor (checked against the VIAF
# IRI) and to its generic OCLC work entities. This requires additional
# RDF retrieval for each work (slower processing).
#
# Notes on an OCLC data inconsistency:
# RDFa for http://www.worldcat.org/identities/lccn-n79044798 contains:
#   <http://worldcat.org/oclc/747413718> a <http://schema.org/CreativeWork> .
# However, the RDF for <http://worldcat.org/oclc/747413718> contains:
#   <http://www.worldcat.org/oclc/747413718> schema:exampleOfWork <http://worldcat.org/entity/work/id/994448191> .
# Note how the subject here is 'WWW.worldcat.org' instead of 'worldcat.org'.
# A gsub-based work-around for this was tried and is currently disabled.
#
# TODO: Is auth the subject of the work (as in biography) or both (as in autobiography).
def get_oclc_links
  oclc_iri = begin
    # Try to get OCLC using LOC ID.
    @loc.get_oclc_identity
  rescue StandardError
    get_iri4oclc
  end
  return if oclc_iri.nil?

  # Try to get additional data from OCLC, using the RDFa
  # available in the OCLC identities pages.
  oclc_auth = OclcIdentity.new oclc_iri
  @graph.insert RDF::Statement(@loc.rdf_uri, RDF::OWL.sameAs, oclc_auth.rdf_uri)
  oclc_auth.creative_works.each do |creative_work_uri|
    creative_work = OclcCreativeWork.new creative_work_uri
    @graph.insert RDF::Statement(oclc_auth.rdf_uri, RDF::RDFS.seeAlso, creative_work.rdf_uri)
    next unless @@config.oclc_auth2works

    # Try to use VIAF to relate auth to work as creator, contributor, editor, etc.
    unless @viaf.nil?
      role =
        if creative_work.creator? @viaf.iri
          RDF::SCHEMA.creator
        elsif creative_work.contributor? @viaf.iri
          RDF::SCHEMA.contributor
        elsif creative_work.editor? @viaf.iri
          RDF::SCHEMA.editor
        end
      @graph.insert RDF::Statement(creative_work.rdf_uri, role, oclc_auth.rdf_uri) if role
    end

    # Try to find the generic work entity for this example work.
    creative_work.get_works.each do |oclc_work_uri|
      oclc_work = OclcWork.new oclc_work_uri
      @graph.insert RDF::Statement(creative_work.rdf_uri, RDF::SCHEMA.exampleOfWork, oclc_work.rdf_uri)
    end
  end
end
443
+
444
+ # TODO: use an 'affiliation' entry, maybe 373? (optional field)
445
+
446
# Serialize the record's graph by calling #to_ttl on the result of #graph.
#
# @return whatever the graph's #to_ttl returns
def to_ttl
  rdf_graph = graph
  rdf_graph.to_ttl
end
449
+
450
# Build (once) and return the RDF graph for this authority record.
#
# A graph that already holds statements is returned as is. Otherwise the
# library, LOC, VIAF and ISNI authorities are resolved, owl:sameAs links
# are added for those that were found, and, when @@config.get_loc is set,
# the LOC data is used to add type and name triples. OCLC links are added
# when @@config.get_oclc is set.
#
# Fix: @loc can be nil (its construction is rescued, and the debug check
# below expects that case), but it was dereferenced without a guard,
# unlike @viaf and @isni. A missing LOC authority is now logged and the
# graph built so far is returned instead of raising NoMethodError.
#
# TODO: figure out how to specify all the graph prefixes.
# TODO: ORCID? VIVO? VITRO? Stanford CAP?
# TODO: find codes in the marc record to differentiate the authority into
#       person, organization, event, etc. without getting LOC RDF.
#
# @return [RDF::Graph] the graph for this record
def graph
  return @graph unless @graph.empty?
  @lib = LibAuth.new get_iri4lib
  # Try to find LOC, VIAF, and ISNI IRIs in the MARC record
  @loc = begin
    Loc.new get_iri4loc
  rescue StandardError
    nil
  end
  # Try to identify problems in getting an LOC IRI.
  binding.pry if (@@config.debug && @loc.nil?)
  # might require LOC to get VIAF.
  @viaf = begin
    Viaf.new get_iri4viaf
  rescue StandardError
    nil
  end
  # might require VIAF to get ISNI.
  @isni = begin
    Isni.new get_iri4isni
  rescue StandardError
    nil
  end

  # Get LOC control number and add catalog permalink? e.g.
  # http://lccn.loc.gov/n79046291

  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @loc.rdf_uri) unless @loc.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @viaf.rdf_uri) unless @viaf.nil?
  @graph.insert RDF::Statement(@lib.rdf_uri, RDF::OWL.sameAs, @isni.rdf_uri) unless @isni.nil?
  return @graph unless @@config.get_loc

  # Everything below reads from @loc, so stop here when it is unavailable.
  if @loc.nil?
    @@config.logger.error "No LOC authority available for #{@lib.rdf_uri}"
    return @graph
  end

  if @loc.iri.to_s =~ /name/
    parse_loc_auth_name
  elsif @loc.iri.to_s =~ /subjects/
    parse_loc_auth_subject
  else
    binding.pry if @@config.debug
  end
  # Optional elaboration of authority data with OCLC identity and works.
  get_oclc_links if @@config.get_oclc

  @@config.logger.info "Extracted #{@loc.id}"
  @graph
end
489
+ end
490
+
491
+ end
492
+
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'marc'
4
+ require 'linkeddata'
5
+ require 'pry'
6
+
7
+ #EXAMPLE_RECORD_FILE='../marc/catalog/stf.00.mrc'
8
+ EXAMPLE_RECORD_FILE='../marc/catalog/stf.51.mrc'
9
+
10
+ ## reading records from a batch file
11
+ #reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "MARC-8")
12
+ #reader = MARC::Reader.new(EXAMPLE_RECORD_FILE, :external_encoding => "UTF-8", :validate_encoding => true)
13
+
14
+ #reader = MARC::ForgivingReader.new(EXAMPLE_RECORD_FILE)
15
+
16
# Read the first record of the example file: the first five bytes of a
# MARC record hold its total length (e.g. "00774" => 774), so read those,
# rewind, and read exactly that many bytes.
#
# The block form of File.open closes the handle, which the previous
# File.new call never did.
raw = File.open(EXAMPLE_RECORD_FILE, 'rb') do |handle|
  rec_length = handle.read(5).to_i
  handle.rewind
  handle.read(rec_length)
end
record = MARC::Reader.decode(raw)
23
+
24
+ # From http://www.loc.gov/marc/authority/adleader.html
25
+ # System-Generated Elements - The following Leader elements are usually system generated:
26
+ #
27
+ # 00-04 Logical record length
28
+ #
29
+ # 05 - Record status:
30
+ # a - Increase in encoding level
31
+ # c - Corrected or revised
32
+ # d - Deleted
33
+ # n - New
34
+ # o - Obsolete
35
+ # s - Deleted; heading split into two or more headings
36
+ # x - Deleted; heading replaced by another heading
37
+ #
38
+ # 06 - Type of record
39
+ # z - Authority data
40
+ #
41
+ # 07-08 Undefined character positions
42
+ #
43
+ # 09 - Character coding scheme
44
+ # # - MARC-8
45
+ # a - UCS/Unicode
46
+ #
47
+ # 10 Indicator count
48
+ # 2 - Number of character positions used for indicators
49
+ #
50
+ # 11 Subfield code count
51
+ # 2 - Number of character positions used for a subfield code
52
+ #
53
+ # 12-16 Base address of data
54
+ # [number] - Length of Leader and Directory
55
+ #
56
+ # 17 - Encoding level
57
+ # n - Complete authority record
58
+ # o - Incomplete authority record
59
+ #
60
+ # 20-23 Entry map
61
+ #
62
+ # 18-19 - Undefined character positions
63
+ #
64
+ # 20 - Length of the length-of-field portion
65
+ # 4 - Number of characters in the length-of-field portion of a Directory entry
66
+ #
67
+ # 21 - Length of the starting-character-position portion
68
+ # 5 - Number of characters in the starting-character-position portion of a Directory entry
69
+ #
70
+ # 22 - Length of the implementation-defined portion
71
+ # 0 - Number of characters in the implementation-defined portion of a Directory entry
72
+ #
73
+ # It is common for default values in other Leader elements to be generated automatically as well.
74
+ # Capitalization - Alphabetic codes are input as lower case letters.
75
+ #
76
+ # example:
77
+ #record.leader
78
+ #=> "00774cz a2200253n 4500"
79
+ # 00-04: '00774' - record length
80
+ # 05: 'c' - corrected or revised
81
+ # 06: 'z' - always 'z' for authority records
82
+ # 09: 'a' - UCS/Unicode
83
+ # 12-16: '00253' - base address of data, Length of Leader and Directory
84
+ # 17: 'n' - Complete authority record
85
# Descriptions of the record status codes found at leader position 05.
# Frozen so the lookup table cannot be modified by accident.
LEADER_STATUS_CODES = {
  'a' => 'Increase in encoding level',
  'c' => 'Corrected or revised',
  'd' => 'Deleted',
  'n' => 'New',
  'o' => 'Obsolete',
  's' => 'Deleted; heading split into two or more headings',
  'x' => 'Deleted; heading replaced by another heading'
}.freeze
94
# Decode the main elements of a record's leader string.
#
# Changes: the result hash is returned directly instead of through an
# unused local, and the completeness flag is a plain comparison, so a
# leader shorter than 18 characters gives false instead of raising.
#
# @param record [#leader] a record whose leader is a String
# @return [Hash] :length, :status, :type, :encoding, :data_address, :complete
def leader_parse(record)
  leader = record.leader
  {
    :length => leader[0..4].to_i,
    :status => leader[5], # see LEADER_STATUS_CODES for descriptions
    :type => leader[6], # always 'z' for authority records
    :encoding => leader[9], # TODO: translate letter code into ruby encoding string
    :data_address => leader[12..16].to_i,
    :complete => leader[17] == 'n'
  }
end
104
+
105
+
106
+ # Stanford Resource keys and Ckeys can collide. They are only unique within their own set.
107
+ #
108
+ # When I do a catalogdump for ckey 6809804 I see:
109
+ #
110
+ # .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
111
+ #
112
+ # When we do a catalogdump for searchworks we filter the results to only export
113
+ # records with holdings, and not those things which are on order or "shadowed"
114
+ # i.e. hidden from public view, although we still have the bibliographic data in
115
+ # the database. When I extracted the records for conversion I selected all of
116
+ # them.
117
+ #
118
+ # - Josh
119
+
120
+
121
+ # Create SUL LOD...
122
# Base IRI for Stanford University Libraries linked data.
SUL_URI = RDF::URI.new('http://linked-data.stanford.edu/library/')

# extract catalog key from field 001 (use the first one)
field001 = record.fields.find { |field| field.tag == '001' }
cat_key = field001.value.strip
CAT_URI = SUL_URI.join("catalog/#{cat_key}")

# TODO: Evaluate whether cat_key is in SearchWorks, e.g.
# "http://searchworks.stanford.edu/catalog/#{cat_key}"
# http://searchworks.stanford.edu/catalog/7106054

# TODO: extract 035a for OCLC master control number.
# TODO: map the OCLC to the OCLC work number.
field035 = record.fields.select { |field| field.tag == '035' }


# Stop here for interactive exploration; nothing below this point runs.
binding.pry
exit!
140
+
141
+
142
+ #There is nothing in the MARC record itself to indicate that a holding is
143
+ #'shadowed' (not available for public view), but one idea to handle them is
144
+ #to supply a list of shadowed ckeys and that list could easily be transformed
145
+ #
146
+ #into a list of triples like this:
147
+ # <http://linked-data.stanford.edu/library/catalog/{cKey}> <rdf:Property> <http://linked-data.stanford.edu/library/catalog/isShadowed>
148
+ #...or whatever predicate and object you want to use. Then you can load those into the triple store.
149
+
150
+ # .948. |hNO HOLDINGS IN STF - 7 OTHER HOLDINGS
151
+ # When we do a catalogdump for searchworks we filter the results to only export
152
+ # records with holdings, and not those things which are on order or "shadowed"
153
+ # i.e. hidden from public view, although we still have the bibliographic data in
154
+ # the database. When I extracted the records for conversion I selected all of
155
+ # them.
156
# Holdings note from the first 948 field, when the record has one.
# (The previous version raised NoMethodError for records without a 948.)
field948 = record.fields.select { |field| field.tag == '948' }
holdings = field948.first && field948.first.value


# TODO: construct RDF model, see http://blog.datagraph.org/2010/03/rdf-for-ruby
# RDF::Literal.new("Hello!", :language => :en)
#
# Fix: the catalog IRI is the constant CAT_URI; `cat_uri` was never defined.
# FIXME: `oclc4loc` is not defined anywhere in this script, so this hash
#        still raises NameError when reached; it needs the list of OCLC
#        IRIs that the 035a TODO above is meant to produce.
lod = {
  :id => CAT_URI,
  :oclc => oclc4loc.collect {|uri| RDF::URI.new(uri) },
}

binding.pry
exit!
170
+
171
+ #for record in reader
172
+ # # print out field 245 subfield a
173
+ # puts record['245']['a']
174
+ #end
175
+