marc2linkeddata 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'marc2linkeddata'
4
+
5
+ CONFIG = Marc2LinkedData.configuration
6
+
7
# Count the authority records in an open MARC file.
#
# Reads the stream record by record: each record's leader is parsed, the
# stream is moved forward by the leader's :length, and the record is counted
# when the leader's :type is 'z'.  Any error raised while handling a record
# is printed to stdout (and opens a pry session when CONFIG.debug is set),
# after which the loop carries on.  The stream is repositioned at its start
# before returning.
#
# @param marc_file [IO] open, seekable stream of MARC records
# @return [Integer] how many records had leader type 'z'
def marc_auth_count(marc_file)
  z_records = 0
  until marc_file.eof?
    begin
      header = Marc2LinkedData::ParseMarcAuthority.parse_leader(marc_file)
      marc_file.seek(header[:length], IO::SEEK_CUR)
      z_records += 1 if header[:type] == 'z'
    rescue => error
      puts
      puts 'ERROR'
      puts error.message
      puts error.backtrace
      puts
      binding.pry if CONFIG.debug
    end
  end
  # Leave the stream where the caller can read it again from the top.
  marc_file.seek(0, IO::SEEK_SET)
  z_records
end
26
+
27
# Translate the MARC authority records in +marc_filename+ into an RDF
# turtle file.
#
# Output file name: a trailing '.mrc' extension is replaced by '.ttl'; any
# other input name gets '.ttl' appended.  The output path can therefore never
# equal the input path (opening the input for writing would truncate the MARC
# data before it is read), and a '.mrc' inside a directory name is left alone.
#
# Only records whose leader :type is 'z' are translated.  Blank lines are
# dropped from each record's turtle, and '@prefix' lines are kept only until
# a record containing them has actually been written.  An error on one record
# is printed to stdout (and opens a pry session when CONFIG.debug is set) and
# processing continues with the next record.
#
# @param marc_filename [String] path to a file of MARC records
# @return [nil]
def marc2ld(marc_filename)
  ld_filename = marc_filename.sub(/\.mrc\z/, '.ttl')
  ld_filename = "#{marc_filename}.ttl" if ld_filename == marc_filename
  puts "Translating: #{marc_filename} to #{ld_filename}"
  # The input is opened first so a missing MARC file does not leave an empty
  # output file behind; the block forms close both files on every exit path.
  File.open(marc_filename, 'r') do |marc_file|
    File.open(ld_filename, 'w') do |ld_file|
      # Marc2LinkedData.write_prefixes(ld_file)
      prefixes_written = false
      auth_records = marc_auth_count(marc_file)
      progress = ProgressBar.create(:total => auth_records, :format => '%a %f |%b>>%i| %P%% %t')
      until marc_file.eof?
        # Reset per record so the error report below never shows a record
        # left over from an earlier iteration.
        record = nil
        begin
          leader = Marc2LinkedData::ParseMarcAuthority.parse_leader(marc_file)
          raw = marc_file.read(leader[:length])
          next unless leader[:type] == 'z'
          progress.increment
          record = MARC::Reader.decode(raw)
          # ParseMarcAuthority is a lazy parser, so
          # init only assigns record to an instance var.
          auth = Marc2LinkedData::ParseMarcAuthority.new(record)
          # auth_id = "auth:#{auth.get_id}"
          # triples = nil
          # # TODO: enable additional persistence options
          # # Use data already in redis (if enabled)
          # triples = CONFIG.redis.get(auth_id) if CONFIG.redis_read
          # if triples.nil?
          #   triples = auth.to_ttl  # generate new triples
          #   # Update redis (if enabled) for triples not read from redis
          #   CONFIG.redis.set(auth_id, triples) if CONFIG.redis_write
          # end

          triples = auth.to_ttl.lines
          binding.pry if CONFIG.debug && triples.empty?
          triples.reject! { |line| line.chomp.empty? }
          has_prefixes = triples.any? { |line| line.start_with?('@prefix') }
          triples.reject! { |line| line.start_with?('@prefix') } if prefixes_written
          ld_file.write(triples.join)
          ld_file.flush
          # Only mark the prefixes as emitted once they really reached the
          # file; a failed first record must not suppress them for good.
          prefixes_written ||= has_prefixes
        rescue => e
          puts
          puts 'ERROR'
          puts e.message
          puts e.backtrace
          puts record.to_s
          puts
          binding.pry if CONFIG.debug
        end
      end
      ld_file.flush
    end
  end
  nil
end
79
+
80
# Command-line entry point: translate every MARC file named in ARGV.
# Arguments that are not existing regular files are reported on stderr and
# skipped; when nothing usable is left, the usage text is printed.
require 'pathname'

marc_files = []
ARGV.each do |filename|
  path = Pathname(filename)
  if path.file?
    marc_files.push(path)
  else
    warn "Skipping #{filename}: not an existing file"
  end
end
if marc_files.empty?
  puts <<HELP
#{__FILE__} marc_authority_file1.mrc [ marc_authority_file2.mrc .. marc_authority_fileN.mrc ]

Output is RDF triples in a turtle file (.ttl) for every input .mrc file.
Optional persistence services can be controlled by environment variables.

Redis Persistence - based on https://github.com/redis/redis-rb
- essential options:
export REDIS4MARC=true # enable redis persistence (default = false)
- supplementary options:
Set the REDIS_URL for a custom redis configuration.
export REDIS_URL="redis://{user}:{password}@{host}:{port}/{db}"
export REDIS_READ=true # enable redis reads (default = REDIS4MARC || false)
# faster reading of triples from pre-populated redis data
export REDIS_WRITE=true # enable redis writes (default = REDIS4MARC || false)
# recent data is updated in redis

HELP
end

puts "Logging to: #{CONFIG.log_file}"
marc_files.each do |path|
  CONFIG.logger.info "Processing: #{path}"
  marc2ld(path.to_s)
end
112
+
113
+
@@ -0,0 +1,146 @@
1
+
2
module Marc2LinkedData

  # Runtime settings for the library, read from environment variables when
  # an instance is created.  Creating an instance also opens the log and,
  # when REDIS4MARC is enabled, connects to redis.
  class Configuration

    attr_accessor :debug

    attr_accessor :field_auth_loc
    attr_accessor :field_auth_isni
    attr_accessor :field_auth_oclc
    attr_accessor :field_auth_viaf

    attr_accessor :get_isni
    attr_accessor :get_loc
    attr_accessor :get_oclc
    attr_accessor :get_viaf
    attr_accessor :oclc_auth2works

    attr_accessor :local_loc_user
    attr_accessor :local_loc_pass
    attr_accessor :local_loc_url

    attr_accessor :prefixes

    attr_accessor :use_foaf
    attr_accessor :use_schema

    attr_accessor :redis4marc
    attr_accessor :redis_read
    attr_accessor :redis_write
    attr_accessor :redis

    attr_accessor :log_file
    attr_accessor :logger

    # Populate every setting from ENV (see the private init_* helpers).
    def initialize
      @debug = env_boolean('DEBUG')
      init_logging
      init_prefixes
      init_authority_options
      init_local_loc
      init_redis
      # TODO: provide options for triple stores
    end

    # Interpret an environment variable as a boolean flag.
    #
    # @param var [String] name of the environment variable
    # @param default [Boolean] value used when the variable is not set
    # @return [Boolean] true only when the variable holds 'true' (any letter
    #   case); +default+ when it is unset
    def env_boolean(var, default = false)
      value = ENV[var]
      value.nil? ? default : value.to_s.upcase == 'TRUE'
    rescue TypeError
      # ENV[] rejects non-String names; treat that like an unset variable.
      default
    end

    # Connect to redis when REDIS4MARC is enabled; does nothing otherwise.
    # The redis gems are required lazily so they are only needed when the
    # feature is switched on.
    #
    # @return [Object, nil] the reply to the redis ping, or nil when disabled
    def redis_config
      return unless @redis4marc
      # https://github.com/redis/redis-rb
      # storing objects in redis:
      #redis.set "foo", [1, 2, 3].to_json
      #JSON.parse(redis.get("foo"))
      require 'hiredis'
      require 'redis'
      # redis url should be of the form "redis://{user}:{password}@{host}:{port}/{db}";
      # without one the client default ('redis://127.0.0.1:6379/0') is used.
      @redis = @redis_url ? Redis.new(:url => @redis_url) : Redis.new
      @redis.ping
    end

    private

    # Open the log named by LOG_FILE (default 'marc2ld.log'), falling back
    # to STDERR when the file cannot be created.
    def init_logging
      @log_file = File.absolute_path(ENV['LOG_FILE'] || 'marc2ld.log')
      log_path = File.dirname(@log_file)
      unless File.directory?(log_path)
        begin
          # try to create the log directory
          Dir.mkdir(log_path)
        rescue StandardError
          # Best effort only: opening the file below falls back to STDERR.
        end
      end
      begin
        log_device = File.new(@log_file, 'w+')
      rescue StandardError
        log_device = $stderr
        @log_file = 'STDERR'
      end
      @logger = Logger.new(log_device, 'monthly')
      @logger.level = @debug ? Logger::DEBUG : Logger::INFO
    end

    # Build the table of RDF prefixes.
    def init_prefixes
      @prefixes = {}
      # Library specific prefixes (use .env file or set shell ENV)
      @prefixes['lib'] = ENV['LIB_PREFIX'] || 'http://linked-data.stanford.edu/library/'
      @prefixes['lib_auth'] = "#{@prefixes['lib']}authority/"
      @prefixes['lib_cat'] = "#{@prefixes['lib']}catalog/"
      # Static Prefixes
      @prefixes['bf'] = 'http://bibframe.org/vocab/'
      @prefixes['foaf'] = 'http://xmlns.com/foaf/0.1/'
      @prefixes['isni'] = 'http://www.isni.org/isni/'
      @prefixes['loc_names'] = 'http://id.loc.gov/authorities/names/'
      @prefixes['loc_subjects'] = 'http://id.loc.gov/authorities/subjects/'
      @prefixes['rdf'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
      @prefixes['rdfs'] = 'http://www.w3.org/2000/01/rdf-schema#'
      @prefixes['schema'] = 'http://schema.org/'
      @prefixes['owl'] = 'http://www.w3.org/2002/07/owl#'
      @prefixes['viaf'] = 'http://viaf.org/viaf/'
    end

    # Authority parse options and vocabulary switches.
    def init_authority_options
      @field_auth_loc = ENV['FIELD_AUTH_LOC']
      @field_auth_isni = ENV['FIELD_AUTH_ISNI']
      @field_auth_oclc = ENV['FIELD_AUTH_OCLC']
      @field_auth_viaf = ENV['FIELD_AUTH_VIAF']

      @get_isni = env_boolean('GET_ISNI')
      @get_loc = env_boolean('GET_LOC')
      @get_viaf = env_boolean('GET_VIAF')
      @get_oclc = env_boolean('GET_OCLC')
      @oclc_auth2works = env_boolean('OCLC_AUTH2WORKS')

      # Vocabulary options
      # foaf:Person or schema:Person or both?
      @use_foaf = env_boolean('USE_FOAF')
      @use_schema = env_boolean('USE_SCHEMA') # schema.org
    end

    # Local triple store for LOC authority data,
    # accessed via an HTTP API with basic authentication.
    # See downloads at http://id.loc.gov/download/
    def init_local_loc
      @local_loc_user = ENV['LOCAL_LOC_USER']
      @local_loc_pass = ENV['LOCAL_LOC_PASS']
      loc_host = ENV['LOCAL_LOC_HOST']
      loc_port = ENV['LOCAL_LOC_PORT']
      loc_path = ENV['LOCAL_LOC_PATH']
      # Note: with none of the variables set this is the string 'http://:'.
      @local_loc_url = "http://#{loc_host}:#{loc_port}#{loc_path}"
    end

    # Persistence options.  REDIS_URL is kept as the URL string (blank means
    # "use the client default"); REDIS_READ and REDIS_WRITE follow REDIS4MARC
    # unless they are set explicitly.
    def init_redis
      @redis = nil
      @redis4marc = env_boolean('REDIS4MARC')
      if @redis4marc
        url = ENV['REDIS_URL'].to_s.strip
        @redis_url = url.empty? ? nil : url
        @redis_read = env_boolean('REDIS_READ', true)
        @redis_write = env_boolean('REDIS_WRITE', true)
        redis_config
      else
        @redis_url = nil
        @redis_read = false
        @redis_write = false
      end
    end

  end

end
146
+
@@ -0,0 +1,23 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # A Resource identified by an ISNI IRI.
  class Isni < Resource

    # Interesting slide presentation about ISNI
    # http://www.slideshare.net/JaniferGatenby/viaf-and-isni-ifla-2014-0815

    PREFIX = 'http://www.isni.org/isni/'

    # RDF for this resource, fetched once from the IRI with '.rdf' appended
    # (e.g. 'http://www.isni.org/isni/0000000109311081.rdf') and then reused.
    #
    # @return [Object, nil] whatever get_rdf returned, or nil without an IRI
    def rdf
      if @iri.nil?
        nil
      elsif @rdf.nil?
        @rdf = get_rdf("#{@iri}.rdf")
      else
        @rdf
      end
    end

  end

end
23
+
@@ -0,0 +1,17 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # Resource subclass that currently adds nothing of its own: all behavior
  # is inherited from Resource.
  # NOTE(review): presumably represents the library's own authority IRIs
  # (the 'lib_auth' prefix) - confirm against the callers.
  class LibAuth < Resource

    # An RDF retrieval override is sketched below but left disabled.
    # def rdf
    #   return nil if @iri.nil?
    #   return @rdf unless @rdf.nil?
    #   uri4rdf = @iri.to_s + '.rdf'
    #   @rdf = get_rdf(uri4rdf)
    # end

  end

end
17
+
@@ -0,0 +1,91 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # A Resource identified by a Library of Congress authority IRI.
  class Loc < Resource

    PREFIX = 'http://id.loc.gov/authorities/'
    PREFIX_NAMES = "#{PREFIX}names/"
    PREFIX_SUBJECTS = "#{PREFIX}subjects/"

    # Namespace of the MADS/RDF terms tested by the type predicates below.
    MADS = 'http://www.loc.gov/mads/rdf/v1#'

    # def id
    #   return nil if @iri.nil?
    #   @id ||= @iri.basename
    #   # Could get id from rdf, but that incurs costs for RDF retrieval and parsing etc.
    #   #oclc_id = '<identifiers:oclcnum>oca04921729</identifiers:oclcnum>'
    #   #<identifiers:lccn>no 99010609</identifiers:lccn>
    #   #<identifiers:oclcnum>oca04921729</identifiers:oclcnum>
    # end

    # RDF for this resource, fetched once from the IRI with '.rdf' appended
    # and then reused.
    #
    # @return [Object, nil] whatever get_rdf returned, or nil without an IRI
    def rdf
      return nil if iri.nil?
      return @rdf unless @rdf.nil?
      uri4rdf = iri.to_s + '.rdf'
      @rdf = get_rdf(uri4rdf)
    end

    # The authoritative label found in this resource's RDF.
    #
    # @return [String, nil] the label; nil when there is no matching
    #   statement or when retrieving/querying the RDF fails
    def label
      label_predicate = "<#{MADS}authoritativeLabel>"
      query = SPARQL.parse("SELECT * WHERE { <#{@iri}> #{label_predicate} ?o }")
      rdf.query(query).first[:o].to_s
    rescue StandardError
      # Deliberate best effort: any failure means "no label available".
      nil
    end

    # @return [Boolean] whether the IRI is typed as a MADS Authority
    def authority?
      mads_type?('Authority')
    end

    # @return [Boolean] whether the IRI is typed as a MADS DeprecatedAuthority
    def deprecated?
      mads_type?('DeprecatedAuthority')
    end

    # @return [Boolean] whether the IRI is typed as a MADS ConferenceName
    def conference?
      mads_type?('ConferenceName')
    end

    # @return [Boolean] whether the IRI is typed as a MADS CorporateName
    def corporation?
      mads_type?('CorporateName')
    end

    # @return [Boolean] whether the IRI is typed as a MADS NameTitle
    def name_title?
      mads_type?('NameTitle')
    end

    # @return [Boolean] whether the IRI is typed as a MADS PersonalName
    def person?
      mads_type?('PersonalName')
    end

    # @return [Boolean] whether the IRI is typed as a MADS Geographic
    def place?
      mads_type?('Geographic')
    end

    # Try to get the OCLC identity IRI from the LOC ID; the result is cached.
    # http://oclc.org/developer/develop/web-services/worldcat-identities.en.html
    # e.g. http://www.worldcat.org/identities/lccn-n79044803/
    # e.g. http://www.worldcat.org/identities/lccn-n79044798/
    #
    # @return [Object] whatever resolve_external_auth returned for the URL
    def get_oclc_identity
      return @oclc_iri unless @oclc_iri.nil?
      oclc_url = escape_url('http://www.worldcat.org/identities/lccn-' + id + '/')
      @oclc_iri = resolve_external_auth(oclc_url)
      # TODO: OCLC might redirect and then provide a 'fast' URI for obsolete identity records.
    end

    # Try to get the VIAF IRI from the LOC sourceID; the result is cached.
    # LOC statement with VIAF URI, e.g.:
    # s: <http://id.loc.gov/authorities/names/n79046291>
    # p: <http://www.loc.gov/mads/rdf/v1#hasExactExternalAuthority>
    # o: <http://viaf.org/viaf/sourceID/LC%7Cn+79046291#skos:Concept> .
    #
    # @return [Object] whatever resolve_external_auth returned for the URL
    def get_viaf
      return @viaf_iri unless @viaf_iri.nil?
      #return nil unless rdf_valid?
      #@viaf_iri ||= rdf_find_object 'viaf'
      viaf_url = escape_url('http://viaf.org/viaf/sourceID/LC|' + id + '#skos:Concept')
      @viaf_iri = resolve_external_auth(viaf_url)
    end

    private

    # True when one of this IRI's types equals the named MADS/RDF class.
    # `any?` stops at the first match and leaves the solutions untouched.
    def mads_type?(name)
      iri_types.any? { |s| s[:o] == "#{MADS}#{name}" }
    end

    # Percent-escape characters that are not allowed in a URI.
    # URI.encode / URI.escape were removed in Ruby 3.0; the RFC 2396 parser
    # provides the same escaping.
    def escape_url(url)
      URI::RFC2396_Parser.new.escape(url)
    end

  end

end
91
+
@@ -0,0 +1,44 @@
1
+ require_relative 'oclc_resource'
2
+
3
module Marc2LinkedData

  # An OCLC resource queried for the work(s) it is an example of.
  class OclcCreativeWork < OclcResource

    PREFIX = 'http://www.worldcat.org/oclc/'

    # Objects of schema:exampleOfWork for this resource.
    #
    # OCLC data is inconsistent in how it writes the subject IRI, so up to
    # four spellings are tried in turn and the first non-empty answer wins:
    # the IRI as stored, without 'www.', with the ID cast to an integer, and
    # with both changes.  (OclcResource coerces @iri so it includes 'www.'.)
    # assume an exampleOfWork can only ever link to one work?
    #
    # @return [Array] the matching objects; empty when no spelling matches
    def get_works
      # Lambdas keep each spelling lazy: `id` is only consulted when the
      # earlier spellings found nothing.
      spellings = [
        lambda { @iri },
        lambda { @iri.to_s.gsub('www.','') },
        lambda { @iri.to_s.gsub(id, id.to_i.to_s) },
        lambda { @iri.to_s.gsub('www.','').gsub(id, id.to_i.to_s) }
      ]
      found = []
      spellings.each do |spelling|
        q = query_work(spelling.call)
        found = rdf.query(q).map { |solution| solution[:o] }
        break unless found.empty?
      end
      found
    end

    # Build the SPARQL query that selects the schema:exampleOfWork objects
    # of +uri+.
    def query_work(uri)
      SPARQL.parse("SELECT * WHERE { <#{uri}> <http://schema.org/exampleOfWork> ?o }")
    end

    # TODO: get ISBN?

  end

end
44
+
@@ -0,0 +1,46 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # A Resource identified by an OCLC WorldCat identities IRI.
  class OclcIdentity < Resource

    PREFIX = 'http://www.worldcat.org/identities/'

    # RDF for this identity, fetched once and then reused.  The request goes
    # to the IRI with a trailing '/', e.g.
    # 'http://www.worldcat.org/identities/lccn-n79044803/'
    # (the html returned contains RDFa data).
    #
    # @return [Object, nil] whatever get_rdf returned, or nil without an IRI
    def rdf
      return nil if @iri.nil?
      if @rdf.nil?
        location = @iri.to_s
        location = "#{location}/" unless location.end_with?('/')
        @rdf = get_rdf(location)
      end
      @rdf
    end

    # def get_xml
    #   begin
    #     return @xml unless @xml.nil?
    #     http = Net::HTTP.new @iri.host
    #     resp = http.get(@iri.path, {'Accept' => 'application/xml'})
    #     case resp.code
    #       when '301','302','303'
    #         #301 Moved Permanently; 302 Moved Temporarily; 303 See Other
    #         resp = http.get(resp['location'], {'Accept' => 'application/xml'})
    #     end
    #     if resp.code != '200'
    #       raise
    #     end
    #     @xml = resp.body
    #   rescue
    #     puts 'ERROR: Failed to request OCLC identity xml.'
    #   end
    # end

    # Everything in this identity's RDF that is typed schema:CreativeWork.
    #
    # @return [Array] the ?oclcWork binding of each solution
    def creative_works
      work_query = SPARQL.parse('SELECT * WHERE { ?oclcWork a <http://schema.org/CreativeWork> }')
      rdf.query(work_query).map { |solution| solution[:oclcWork] }
    end

  end

end
46
+
@@ -0,0 +1,79 @@
1
+ require_relative 'resource'
2
+
3
module Marc2LinkedData

  # A Resource identified by an OCLC WorldCat IRI, with helpers that query
  # its RDF for schema.org types and properties.
  class OclcResource < Resource

    PREFIX = 'http://www.worldcat.org/oclc/'

    # Ensure the OCLC IRI contains 'www' in the host name.
    #
    # The check is made on the string form of +uri+, so it also works for
    # URI objects: an object that already contains 'www.' is passed on
    # unchanged instead of being rewritten to 'www.www.worldcat.org', and no
    # `=~` method is needed on the object (Object#=~ is gone in Ruby 3.2).
    #
    # @param uri [String, #to_s, nil] the resource IRI
    def initialize(uri=nil)
      text = uri.to_s
      uri = text.gsub('worldcat.org','www.worldcat.org') unless text.include?('www.')
      super(uri)
    end

    # RDF for this resource, fetched once and then reused.  The request goes
    # to the IRI with '.rdf' appended unless it already ends that way.
    # e.g. 'http://worldcat.org/oclc/004957186'
    # also 'http://www.worldcat.org/oclc/004957186'
    #
    # @return [Object, nil] whatever get_rdf returned, or nil without an IRI
    def rdf
      return nil if @iri.nil?
      return @rdf unless @rdf.nil?
      uri4rdf = @iri.to_s
      uri4rdf += '.rdf' unless uri4rdf.end_with? '.rdf'
      @rdf = get_rdf(uri4rdf)
    end

    # @return [Boolean] whether the IRI is typed schema:Book
    def book?
      schema_type?('http://schema.org/Book')
    end

    # @return [Boolean] whether +uri+ is among the schema:creator objects
    def creator?(uri)
      creators.include? RDF::URI.new(uri)
    end

    # @return [Boolean] whether +uri+ is among the schema:contributor objects
    def contributor?(uri)
      contributors.include? RDF::URI.new(uri)
    end

    # @return [Boolean] whether +uri+ is among the schema:editor objects
    def editor?(uri)
      editors.include? RDF::URI.new(uri)
    end

    # @return [Boolean] whether the IRI is typed schema:MediaObject
    def media_object?
      schema_type?('http://schema.org/MediaObject')
    end

    # All schema:about statements in the RDF, for any subject.
    #
    # @return [Object] the query solutions (with ?s and ?o bindings)
    def about
      q = SPARQL.parse('SELECT * WHERE { ?s <http://schema.org/about> ?o }')
      rdf.query(q)
    end

    # @return [Array] objects of schema:creator for this IRI
    def creators
      schema_objects('creator')
    end

    # @return [Array] objects of schema:contributor for this IRI
    def contributors
      schema_objects('contributor')
    end

    # @return [Array] objects of schema:editor for this IRI
    def editors
      schema_objects('editor')
    end

    # @return [Array] objects of schema:publisher for this IRI
    def publishers
      schema_objects('publisher')
    end

    # @return [Array] objects of schema:isbn for this IRI
    def isbns
      schema_objects('isbn')
    end

    private

    # True when one of this IRI's types equals +type_iri+.
    # `any?` stops at the first match and leaves the solutions untouched.
    def schema_type?(type_iri)
      iri_types.any? { |s| s[:o] == type_iri }
    end

    # Objects of the given schema.org property whose subject is this IRI.
    def schema_objects(property)
      q = SPARQL.parse("SELECT * WHERE { <#{@iri}> <http://schema.org/#{property}> ?o }")
      rdf.query(q).map { |s| s[:o] }
    end
  end

end
79
+
@@ -0,0 +1,19 @@
1
+ require_relative 'oclc_resource'
2
+
3
module Marc2LinkedData

  # An OCLC resource identified by a WorldCat work-entity IRI.
  class OclcWork < OclcResource

    # OCLC is inconsistent with use of 'www' in URIs
    #PREFIX = 'http://www.worldcat.org/entity/work/id/'
    PREFIX = 'http://worldcat.org/entity/work/id/'

    # Objects of schema:workExample for this work.
    #
    # @return [Array] the ?o binding of each solution
    def example_works
      pattern = "<#{@iri}> <http://schema.org/workExample> ?o"
      example_query = SPARQL.parse("SELECT * WHERE { #{pattern} }")
      rdf.query(example_query).map { |solution| solution[:o] }
    end

  end

end
19
+