marc2linkeddata 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'marc2linkeddata'
4
+
5
+ CONFIG = Marc2LinkedData.configuration
6
+
7
# Count the authority records in an open MARC file.
#
# Walks the file leader by leader: each leader is parsed with
# ParseMarcAuthority.parse_leader, the stream is advanced by the leader's
# :length, and records whose leader :type is 'z' are counted.
# Errors raised while handling a record are printed to stdout and the scan
# continues (dropping into pry when CONFIG.debug is set).
#
# @param marc_file [IO] an open, seekable MARC stream
# @return [Integer] the number of type 'z' records; the stream is
#   repositioned at its start before returning
def marc_auth_count(marc_file)
  total = 0
  loop do
    break if marc_file.eof?
    begin
      header = Marc2LinkedData::ParseMarcAuthority.parse_leader(marc_file)
      # Skip over the record body; only the leader is needed for counting.
      marc_file.seek(header[:length], IO::SEEK_CUR)
      total += 1 if header[:type] == 'z'
    rescue => error
      puts
      puts 'ERROR'
      puts error.message
      puts error.backtrace
      puts
      binding.pry if CONFIG.debug
    end
  end
  # Leave the stream ready for a second pass by the caller.
  marc_file.seek(0, IO::SEEK_SET)
  total
end
26
+
27
# Derive the Turtle output path for a MARC input path.
#
# Only a trailing '.mrc' extension (any letter case) is replaced; any other
# name simply gets '.ttl' appended. The result therefore never equals the
# input path, so opening it for writing cannot truncate the MARC input.
#
# @param marc_filename [String] path of the MARC input file
# @return [String] path of the Turtle output file
def ttl_filename_for(marc_filename)
  marc_filename.sub(/\.mrc\z/i, '') + '.ttl'
end

# Translate the authority records of one MARC file into a Turtle file.
#
# Every record whose leader :type is 'z' is decoded, converted with
# ParseMarcAuthority#to_ttl and appended to the output file. '@prefix'
# lines are kept only until they have been written once. Errors on a single
# record are printed to stdout and processing continues with the next
# record (dropping into pry when CONFIG.debug is set).
#
# @param marc_filename [String] path of the MARC input file
# @return [nil]
def marc2ld(marc_filename)
  ld_filename = ttl_filename_for(marc_filename)
  puts "Translating: #{marc_filename} to #{ld_filename}"
  # Block forms close both files even when an exception escapes the loop.
  # The input is opened first so a missing input leaves no empty output file.
  File.open(marc_filename, 'r') do |marc_file|
    File.open(ld_filename, 'w') do |ld_file|
      # Marc2LinkedData.write_prefixes(ld_file)
      prefixes_written = false
      auth_records = marc_auth_count(marc_file)
      progress = ProgressBar.create(:total => auth_records, :format => '%a %f |%b>>%i| %P%% %t')
      until marc_file.eof?
        # Reset per iteration so the rescue below never prints a stale record.
        record = nil
        begin
          leader = Marc2LinkedData::ParseMarcAuthority::parse_leader(marc_file)
          raw = marc_file.read(leader[:length])
          if leader[:type] == 'z'
            progress.increment
            record = MARC::Reader.decode(raw)
            # ParseMarcAuthority is a lazy parser, so
            # init only assigns record to an instance var.
            auth = Marc2LinkedData::ParseMarcAuthority.new(record)
            # auth_id = "auth:#{auth.get_id}"
            # triples = nil
            # # TODO: enable additional persistence options
            # # Use data already in redis (if enabled)
            # triples = CONFIG.redis.get(auth_id) if CONFIG.redis_read
            # if triples.nil?
            #   triples = auth.to_ttl  # generate new triples
            #   # Update redis (if enabled) for triples not read from redis
            #   CONFIG.redis.set(auth_id, triples) if CONFIG.redis_write
            # end

            triples = auth.to_ttl.lines
            binding.pry if CONFIG.debug && triples.empty?
            triples.delete_if { |line| line.chomp.empty? }
            if prefixes_written
              triples.delete_if { |line| line.start_with?('@prefix') }
            else
              # Track what was actually written, so a failed or empty first
              # record cannot cause the prefixes to be dropped for the file.
              prefixes_written = triples.any? { |line| line.start_with?('@prefix') }
            end
            ld_file.write(triples.join)
            # Flush per record so completed output survives a later crash.
            ld_file.flush
          end
        rescue => e
          puts
          puts 'ERROR'
          puts e.message
          puts e.backtrace
          puts record.to_s
          puts
          binding.pry if CONFIG.debug
        end
      end
    end
  end
  nil
end
79
+
80
# Keep only the command line arguments that name existing paths.
marc_files = ARGV.map { |filename| Pathname(filename) }.select(&:exist?)

# Without any usable input file, show the usage text.
puts <<HELP if marc_files.empty?
#{__FILE__} marc_authority_file1.mrc [ marc_authority_file2.mrc .. marc_authority_fileN.mrc ]

Output is RDF triples in a turtle file (.ttl) for every input .mrc file.
Optional persistence services can be controlled by environment variables.

Redis Persistence - based on https://github.com/redis/redis-rb
- essential options:
export REDIS4MARC=true # enable redis persistence (default = false)
- supplementary options:
Set the REDIS_URL for a custom redis configuration.
export REDIS_URL="redis://{user}:{password}@{host}:{port}/{db}"
export REDIS_READ=true # enable redis reads (default = REDIS4MARC || false)
# faster reading of triples from pre-populated redis data
export REDIS_WRITE=true # enable redis writes (default = REDIS4MARC || false)
# recent data is updated in redis

HELP

# Translate each input file, recording progress in the configured log.
puts "Logging to: #{CONFIG.log_file}"
marc_files.each do |marc_path|
  CONFIG.logger.info "Processing: #{marc_path}"
  marc2ld(marc_path.to_s)
end
112
+
113
+
@@ -0,0 +1,146 @@
1
+
2
module Marc2LinkedData

  # Runtime settings for marc2linkeddata.
  #
  # All values are read from environment variables when an instance is
  # created: debug flag, logging target, RDF prefixes, authority parse
  # options, vocabulary options, the local LOC triple store and the optional
  # redis persistence.
  class Configuration

    attr_accessor :debug

    attr_accessor :field_auth_loc
    attr_accessor :field_auth_isni
    attr_accessor :field_auth_oclc
    attr_accessor :field_auth_viaf

    attr_accessor :get_isni
    attr_accessor :get_loc
    attr_accessor :get_oclc
    attr_accessor :get_viaf
    attr_accessor :oclc_auth2works

    attr_accessor :local_loc_user
    attr_accessor :local_loc_pass
    attr_accessor :local_loc_url

    attr_accessor :prefixes

    attr_accessor :use_foaf
    attr_accessor :use_schema

    attr_accessor :redis4marc
    attr_accessor :redis_url
    attr_accessor :redis_read
    attr_accessor :redis_write
    attr_accessor :redis

    attr_accessor :log_file
    attr_accessor :logger

    # Build the configuration from the current environment.
    #
    # Opens (and truncates) the log file, and connects to redis when
    # REDIS4MARC is true.
    def initialize
      @debug = env_boolean('DEBUG')
      configure_logging
      configure_prefixes
      configure_authority_options
      configure_vocabularies
      configure_local_loc
      configure_redis
      # TODO: provide options for triple stores
    end

    # Check whether an ENV variable is set to 'true' (any letter case).
    #
    # @param var [String] name of the environment variable
    # @return [Boolean] false when the variable is unset, holds any other
    #   value, or +var+ is not a usable variable name
    def env_boolean(var)
      ENV[var].to_s.upcase == 'TRUE'
    rescue TypeError, ArgumentError
      # ENV[] rejects non-String names and names containing a null byte.
      false
    end

    # Connect to redis when REDIS4MARC is enabled; otherwise do nothing.
    #
    # Uses @redis_url when one is configured, else the redis-rb default
    # ('redis://127.0.0.1:6379/0'). The connection is checked with a ping.
    #
    # @return [Object, nil] the ping result, or nil when redis is disabled
    def redis_config
      return unless @redis4marc
      # https://github.com/redis/redis-rb
      # storing objects in redis:
      #redis.set "foo", [1, 2, 3].to_json
      #JSON.parse(redis.get("foo"))
      # Loaded lazily so redis stays an optional dependency.
      require 'hiredis'
      require 'redis'
      # redis url should be of the form "redis://{user}:{password}@{host}:{port}/{db}"
      @redis = @redis_url ? Redis.new(:url => @redis_url) : Redis.new
      @redis.ping
    end

    private

    # Open the log target named by LOG_FILE (default 'marc2ld.log') and
    # create the logger; falls back to STDERR when the file cannot be opened.
    def configure_logging
      @log_file = File.absolute_path(ENV['LOG_FILE'] || 'marc2ld.log')
      log_path = File.dirname(@log_file)
      unless File.directory?(log_path)
        begin
          # try to create the log directory
          Dir.mkdir(log_path)
        rescue SystemCallError
          # Best effort only: opening the file below fails and falls back.
        end
      end
      begin
        log_device = File.new(@log_file, 'w+')
      rescue SystemCallError, IOError
        log_device = $stderr
        @log_file = 'STDERR'
      end
      @logger = Logger.new(log_device, 'monthly')
      @logger.level = @debug ? Logger::DEBUG : Logger::INFO
    end

    # Populate the RDF prefix table.
    def configure_prefixes
      @prefixes = {}
      # Library specific prefixes (use .env file or set shell ENV)
      @prefixes['lib'] = ENV['LIB_PREFIX'] || 'http://linked-data.stanford.edu/library/'
      @prefixes['lib_auth'] = "#{@prefixes['lib']}authority/"
      @prefixes['lib_cat'] = "#{@prefixes['lib']}catalog/"
      # Static Prefixes
      @prefixes['bf'] = 'http://bibframe.org/vocab/'
      @prefixes['foaf'] = 'http://xmlns.com/foaf/0.1/'
      @prefixes['isni'] = 'http://www.isni.org/isni/'
      @prefixes['loc_names'] = 'http://id.loc.gov/authorities/names/'
      @prefixes['loc_subjects'] = 'http://id.loc.gov/authorities/subjects/'
      @prefixes['rdf'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
      @prefixes['rdfs'] = 'http://www.w3.org/2000/01/rdf-schema#'
      @prefixes['schema'] = 'http://schema.org/'
      @prefixes['owl'] = 'http://www.w3.org/2002/07/owl#'
      @prefixes['viaf'] = 'http://viaf.org/viaf/'
    end

    # Read the authority parse options.
    def configure_authority_options
      @field_auth_loc = ENV['FIELD_AUTH_LOC']
      @field_auth_isni = ENV['FIELD_AUTH_ISNI']
      @field_auth_oclc = ENV['FIELD_AUTH_OCLC']
      @field_auth_viaf = ENV['FIELD_AUTH_VIAF']

      @get_isni = env_boolean('GET_ISNI')
      @get_loc = env_boolean('GET_LOC')
      @get_viaf = env_boolean('GET_VIAF')
      @get_oclc = env_boolean('GET_OCLC')
      @oclc_auth2works = env_boolean('OCLC_AUTH2WORKS')
    end

    # Read the vocabulary options: foaf:Person or schema:Person or both?
    def configure_vocabularies
      @use_foaf = env_boolean('USE_FOAF')
      @use_schema = env_boolean('USE_SCHEMA') # schema.org
    end

    # Read the settings for a local triple store of LOC authority data,
    # accessed via an HTTP API with basic authentication.
    # See downloads at http://id.loc.gov/download/
    def configure_local_loc
      @local_loc_user = ENV['LOCAL_LOC_USER']
      @local_loc_pass = ENV['LOCAL_LOC_PASS']
      loc_host = ENV['LOCAL_LOC_HOST']
      loc_port = ENV['LOCAL_LOC_PORT']
      loc_path = ENV['LOCAL_LOC_PATH']
      @local_loc_url = "http://#{loc_host}:#{loc_port}#{loc_path}"
    end

    # Read the persistence options and connect to redis when enabled.
    def configure_redis
      @redis = nil
      @redis_url = nil
      @redis4marc = env_boolean('REDIS4MARC')
      if @redis4marc
        # REDIS_URL holds a URL, not a flag; an empty value counts as unset.
        url = ENV['REDIS_URL']
        @redis_url = url unless url.to_s.empty?
        # Reads and writes follow REDIS4MARC unless set explicitly.
        @redis_read = env_boolean_or('REDIS_READ', @redis4marc)
        @redis_write = env_boolean_or('REDIS_WRITE', @redis4marc)
        redis_config
      else
        @redis_read = false
        @redis_write = false
      end
    end

    # Like env_boolean, but returns +default+ when the variable is not set.
    def env_boolean_or(var, default)
      ENV.key?(var) ? env_boolean(var) : default
    end

  end

end
146
+
@@ -0,0 +1,23 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # Resource identified by an ISNI IRI.
  #
  # Interesting slide presentation about ISNI
  # http://www.slideshare.net/JaniferGatenby/viaf-and-isni-ifla-2014-0815
  class Isni < Resource

    PREFIX = 'http://www.isni.org/isni/'

    # RDF for this resource, requested once and then reused.
    #
    # The request goes to the IRI with '.rdf' appended,
    # e.g. 'http://www.isni.org/isni/0000000109311081.rdf'.
    # get_rdf is defined outside this file.
    #
    # @return [Object, nil] whatever get_rdf returns; nil when there is no IRI
    def rdf
      return nil if @iri.nil?
      @rdf = get_rdf("#{@iri}.rdf") if @rdf.nil?
      @rdf
    end

  end

end
23
+
@@ -0,0 +1,17 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # Resource subclass that adds no behavior of its own; every method comes
  # from Resource. The '.rdf' based override of #rdf is kept below,
  # disabled, for reference.
  class LibAuth < Resource

    # def rdf
    #   return nil if @iri.nil?
    #   return @rdf unless @rdf.nil?
    #   uri4rdf = @iri.to_s + '.rdf'
    #   @rdf = get_rdf(uri4rdf)
    # end

  end

end
17
+
@@ -0,0 +1,91 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # Resource identified by a Library of Congress authority IRI.
  #
  # Offers the authoritative label, predicates for the MADS/RDF types of
  # the IRI, and lookups of the matching OCLC identity and VIAF IRIs.
  # iri, id, iri_types, get_rdf and resolve_external_auth are defined
  # outside this file.
  class Loc < Resource

    PREFIX = 'http://id.loc.gov/authorities/'
    PREFIX_NAMES = "#{PREFIX}names/"
    PREFIX_SUBJECTS = "#{PREFIX}subjects/"

    # Namespace of the MADS/RDF vocabulary used by the queries below.
    MADS_RDF = 'http://www.loc.gov/mads/rdf/v1#'

    # URI.encode was removed in Ruby 3.0; this parser's #escape applies the
    # same escaping rules.
    URI_ESCAPER = URI::RFC2396_Parser.new

    # def id
    #   return nil if @iri.nil?
    #   @id ||= @iri.basename
    #   # Could get id from rdf, but that incurs costs for RDF retrieval and parsing etc.
    #   #oclc_id = '<identifiers:oclcnum>oca04921729</identifiers:oclcnum>'
    #   #<identifiers:lccn>no 99010609</identifiers:lccn>
    #   #<identifiers:oclcnum>oca04921729</identifiers:oclcnum>
    # end

    # RDF for this resource, requested once (from the IRI with '.rdf'
    # appended) and then reused.
    #
    # @return [Object, nil] whatever get_rdf returns; nil when there is no IRI
    def rdf
      return nil if iri.nil?
      return @rdf unless @rdf.nil?
      uri4rdf = iri.to_s + '.rdf'
      @rdf = get_rdf(uri4rdf)
    end

    # The MADS authoritativeLabel of this IRI.
    #
    # @return [String, nil] the label; nil when the query fails for any reason
    def label
      label_predicate = "<#{MADS_RDF}authoritativeLabel>"
      query = SPARQL.parse("SELECT * WHERE { <#{@iri}> #{label_predicate} ?o }")
      begin
        rdf.query(query).first[:o].to_s
      rescue StandardError
        # Deliberately best effort: a missing label is reported as nil.
        nil
      end
    end

    # @return [Boolean] true when the IRI has the MADS type Authority
    def authority?
      mads_type?('Authority')
    end

    # @return [Boolean] true when the IRI has the MADS type DeprecatedAuthority
    def deprecated?
      mads_type?('DeprecatedAuthority')
    end

    # @return [Boolean] true when the IRI has the MADS type ConferenceName
    def conference?
      mads_type?('ConferenceName')
    end

    # @return [Boolean] true when the IRI has the MADS type CorporateName
    def corporation?
      mads_type?('CorporateName')
    end

    # @return [Boolean] true when the IRI has the MADS type NameTitle
    def name_title?
      mads_type?('NameTitle')
    end

    # @return [Boolean] true when the IRI has the MADS type PersonalName
    def person?
      mads_type?('PersonalName')
      # iri_types.filter {|s| s[:o] =~ /PersonalName/ }.length > 0
      # obj = rdf_find_object 'PersonalName'
      # obj.nil? ? false : true
    end

    # @return [Boolean] true when the IRI has the MADS type Geographic
    def place?
      mads_type?('Geographic')
    end

    # Try to get the OCLC identity IRI from the LOC ID; the result is reused
    # on later calls.
    # http://oclc.org/developer/develop/web-services/worldcat-identities.en.html
    # e.g. http://www.worldcat.org/identities/lccn-n79044803/
    # e.g. http://www.worldcat.org/identities/lccn-n79044798/
    #
    # @return [Object] whatever resolve_external_auth returns
    def get_oclc_identity
      return @oclc_iri unless @oclc_iri.nil?
      oclc_url = URI_ESCAPER.escape('http://www.worldcat.org/identities/lccn-' + id + '/')
      @oclc_iri = resolve_external_auth(oclc_url)
      # TODO: OCLC might redirect and then provide a 'fast' URI for obsolete identity records.
    end

    # Try to get the VIAF IRI from the LOC sourceID; the result is reused on
    # later calls.
    # LOC statement with VIAF URI, e.g.:
    # s: <http://id.loc.gov/authorities/names/n79046291>
    # p: <http://www.loc.gov/mads/rdf/v1#hasExactExternalAuthority>
    # o: <http://viaf.org/viaf/sourceID/LC%7Cn+79046291#skos:Concept> .
    #
    # @return [Object] whatever resolve_external_auth returns
    def get_viaf
      return @viaf_iri unless @viaf_iri.nil?
      #return nil unless rdf_valid?
      #@viaf_iri ||= rdf_find_object 'viaf'
      viaf_url = URI_ESCAPER.escape('http://viaf.org/viaf/sourceID/LC|' + id + '#skos:Concept')
      @viaf_iri = resolve_external_auth(viaf_url)
    end

    private

    # True when iri_types contains the MADS/RDF type with the given name.
    # NOTE(review): in RDF.rb, Solutions#filter appears to modify its
    # receiver; confirm that iri_types hands out a fresh collection per call.
    def mads_type?(name)
      type_iri = "#{MADS_RDF}#{name}"
      iri_types.filter { |s| s[:o] == type_iri }.length > 0
    end

  end

end
91
+
@@ -0,0 +1,44 @@
1
+ require_relative 'oclc_resource'
2
+
3
module Marc2LinkedData

  # OCLC resource that can be linked to works through
  # schema.org/exampleOfWork.
  class OclcCreativeWork < OclcResource

    PREFIX = 'http://www.worldcat.org/oclc/'

    # Find the works this resource is an example of.
    #
    # OCLC data is inconsistent in its use of 'www.' and of leading zeros in
    # the ID, so up to four spellings of the subject are tried, in order,
    # until one yields results:
    # 1. @iri as it is (OclcResource coerces it to include 'www.')
    # 2. without 'www.'
    # 3. with 'www.', the ID cast to an integer
    # 4. without 'www.' AND the ID cast to an integer
    # This assumes an exampleOfWork can only ever link to one work?
    #
    # @return [Array] the ?o bindings of the first spelling that matched;
    #   empty when none did
    def get_works
      # Lambdas keep each spelling lazy: it is only built when needed.
      subjects = [
        -> { @iri },
        -> { @iri.to_s.gsub('www.','') },
        -> { @iri.to_s.gsub(id, id.to_i.to_s) },
        -> { @iri.to_s.gsub('www.','').gsub(id, id.to_i.to_s) }
      ]
      found = nil
      subjects.each do |subject|
        work_query = query_work(subject.call)
        found = rdf.query(work_query).map { |solution| solution[:o] }
        break unless found.empty?
      end
      found
    end

    # Build the SPARQL query for the exampleOfWork objects of +uri+.
    #
    # @param uri [#to_s] the subject of the query
    # @return [Object] the parsed query, as returned by SPARQL.parse
    def query_work(uri)
      sparql = "SELECT * WHERE { <#{uri}> <http://schema.org/exampleOfWork> ?o }"
      SPARQL.parse(sparql)
    end

    # TODO: get ISBN?

  end

end
44
+
@@ -0,0 +1,46 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # Resource identified by an OCLC WorldCat identities IRI.
  class OclcIdentity < Resource

    PREFIX = 'http://www.worldcat.org/identities/'

    # RDF for this identity, requested once and then reused.
    #
    # The request goes to the IRI with a trailing '/',
    # e.g. 'http://www.worldcat.org/identities/lccn-n79044803/';
    # the html returned contains RDFa data.
    #
    # @return [Object, nil] whatever get_rdf returns; nil when there is no IRI
    def rdf
      return nil if @iri.nil?
      if @rdf.nil?
        location = @iri.to_s
        location = "#{location}/" unless location.end_with? '/'
        @rdf = get_rdf(location)
      end
      @rdf
    end

    # def get_xml
    #   begin
    #     return @xml unless @xml.nil?
    #     http = Net::HTTP.new @iri.host
    #     resp = http.get(@iri.path, {'Accept' => 'application/xml'})
    #     case resp.code
    #       when '301','302','303'
    #         #301 Moved Permanently; 302 Moved Temporarily; 303 See Other
    #         resp = http.get(resp['location'], {'Accept' => 'application/xml'})
    #     end
    #     if resp.code != '200'
    #       raise
    #     end
    #     @xml = resp.body
    #   rescue
    #     puts 'ERROR: Failed to request OCLC identity xml.'
    #   end
    # end

    # Everything typed as schema.org/CreativeWork in this identity's RDF.
    #
    # @return [Array] the ?oclcWork bindings of the query
    def creative_works
      work_query = SPARQL.parse('SELECT * WHERE { ?oclcWork a <http://schema.org/CreativeWork> }')
      rdf.query(work_query).map { |solution| solution[:oclcWork] }
    end

  end

end
46
+
@@ -0,0 +1,79 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # Resource identified by an OCLC WorldCat IRI.
  #
  # Offers schema.org type checks and lookups of the creators,
  # contributors, editors, publishers and ISBNs stated about the IRI.
  # iri_types and get_rdf are defined outside this file.
  class OclcResource < Resource

    PREFIX = 'http://www.worldcat.org/oclc/'

    # Ensure the OCLC IRI contains 'www' in the host name.
    #
    # The check runs on the string form of +uri+, so it behaves the same for
    # Strings and for URI objects, whether or not they implement =~.
    # An argument that already contains 'www.' is passed on unchanged.
    # NOTE(review): nil is passed on as '' (as before); confirm that
    # Resource#initialize expects that.
    #
    # @param uri [#to_s, nil] the OCLC IRI
    def initialize(uri=nil)
      unless uri.to_s.include?('www.')
        uri = uri.to_s.gsub('worldcat.org','www.worldcat.org')
      end
      super(uri)
    end

    # RDF for this resource, requested once and then reused.
    #
    # The request goes to the IRI with '.rdf' appended (unless present),
    # e.g. 'http://worldcat.org/oclc/004957186'
    # also 'http://www.worldcat.org/oclc/004957186'
    #
    # @return [Object, nil] whatever get_rdf returns; nil when there is no IRI
    def rdf
      return nil if @iri.nil?
      return @rdf unless @rdf.nil?
      uri4rdf = @iri.to_s
      uri4rdf += '.rdf' unless uri4rdf.end_with? '.rdf'
      @rdf = get_rdf(uri4rdf)
    end

    # @return [Boolean] true when the IRI has the type schema.org/Book
    def book?
      schema_type?('http://schema.org/Book')
    end

    # @param uri [#to_s] IRI of a possible creator
    # @return [Boolean] true when +uri+ is among the creators
    def creator?(uri)
      creators.include? RDF::URI.new(uri)
    end

    # @param uri [#to_s] IRI of a possible contributor
    # @return [Boolean] true when +uri+ is among the contributors
    def contributor?(uri)
      contributors.include? RDF::URI.new(uri)
    end

    # @param uri [#to_s] IRI of a possible editor
    # @return [Boolean] true when +uri+ is among the editors
    def editor?(uri)
      editors.include? RDF::URI.new(uri)
    end

    # @return [Boolean] true when the IRI has the type schema.org/MediaObject
    def media_object?
      schema_type?('http://schema.org/MediaObject')
    end

    # All schema.org/about statements in this resource's RDF, whatever
    # their subject.
    #
    # @return [Object] the query result, with ?s and ?o bindings
    def about
      q = SPARQL.parse('SELECT * WHERE { ?s <http://schema.org/about> ?o }')
      rdf.query(q)
    end

    # @return [Array] objects of schema.org/creator for this IRI
    def creators
      objects_of('http://schema.org/creator')
    end

    # @return [Array] objects of schema.org/contributor for this IRI
    def contributors
      objects_of('http://schema.org/contributor')
    end

    # @return [Array] objects of schema.org/editor for this IRI
    def editors
      objects_of('http://schema.org/editor')
    end

    # @return [Array] objects of schema.org/publisher for this IRI
    def publishers
      objects_of('http://schema.org/publisher')
    end

    # @return [Array] objects of schema.org/isbn for this IRI
    def isbns
      objects_of('http://schema.org/isbn')
    end

    private

    # Objects of all statements with this IRI as subject and the given
    # predicate IRI.
    def objects_of(predicate)
      q = SPARQL.parse("SELECT * WHERE { <#{@iri}> <#{predicate}> ?o }")
      rdf.query(q).collect {|s| s[:o] }
    end

    # True when iri_types contains the given type IRI.
    def schema_type?(type_iri)
      iri_types.filter {|s| s[:o] == type_iri }.length > 0
    end

  end

end
79
+
@@ -0,0 +1,19 @@
1
+ require_relative 'oclc_resource'
2
+
3
module Marc2LinkedData

  # OCLC resource that can list its schema.org/workExample objects.
  class OclcWork < OclcResource

    # OCLC is inconsistent with use of 'www' in URIs
    #PREFIX = 'http://www.worldcat.org/entity/work/id/'
    PREFIX = 'http://worldcat.org/entity/work/id/'

    # The objects of schema.org/workExample statements about this IRI.
    #
    # @return [Array] the ?o bindings of the query
    def example_works
      example_query = SPARQL.parse("SELECT * WHERE { <#{@iri}> <http://schema.org/workExample> ?o }")
      rdf.query(example_query).map { |solution| solution[:o] }
    end

  end

end
19
+