marc2linkeddata 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env_example +4 -0
- data/README.md +131 -21
- data/bin/marcAuthority2LD +1 -0
- data/lib/marc2linkeddata/configuration.rb +3 -0
- data/lib/marc2linkeddata/parseMarcAuthority.rb +1 -14
- data/lib/marc2linkeddata/resource.rb +10 -25
- data/lib/marc2linkeddata.rb +8 -11
- data/marc2linkeddata.gemspec +1 -1
- data/spec/marc2linkeddata/loc_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30923ebbb08cf2eb45cbe20a79bfc115fb8f695a
|
4
|
+
data.tar.gz: 1a98f477c2f8c9b61b4efb1f8c2a4d171862f2f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c92c1369d3e39d46df6f712a94c27189c848c36b04c529fe1d9cfd2e341da9a1c8fe56f974fb7c011672fb59369572f9427307c077de9b1db0d29f6ced7a7dce
|
7
|
+
data.tar.gz: bf3e016ec9a3b01c6ed7b1bede449ee1c7791fb34fca1150d1e1efde09eb6f94436692e7e4461f32b9458ebd34890b16428bff7e532d8a0aac69a01a13e3a170
|
data/.env_example
CHANGED
@@ -11,8 +11,12 @@
|
|
11
11
|
# Uncomment and set values as required. See used settings in
|
12
12
|
# lib/marc2linkeddata/configuration.rb
|
13
13
|
|
14
|
+
# Enable debug logging and breakpoints at problematic code points.
|
14
15
|
export DEBUG=false
|
15
16
|
|
17
|
+
# Only read X MARC records, for testing purposes?
|
18
|
+
export TEST_RECORDS=0 # 0 for all records
|
19
|
+
|
16
20
|
export LOG_FILE='marc2ld.log'
|
17
21
|
export LIB_PREFIX=http://linked-data.example.org/library/
|
18
22
|
|
data/README.md
CHANGED
@@ -6,10 +6,23 @@ Utilities for translating MARC21 into linked data. The project has
|
|
6
6
|
focused on authority records (as of 2015).
|
7
7
|
|
8
8
|
It has config options that can be enabled to increase the amount of data retrieved.
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
9
|
+
All config options are set by environment variables. The .env_example file documents
|
10
|
+
the options available and how to use a .env file; the `marc2LD_config` utility will
|
11
|
+
copy the .env_example file provided into the current path.
|
12
|
+
|
13
|
+
Without any HTTP retrieval of RDF metadata, using only data in a MARC record, it can
|
14
|
+
translate 100,000 authority records in about 5-6 min on a current laptop system. The
|
15
|
+
config options allow specification of MARC fields that may already contain resource links.
|
16
|
+
With HTTP/RDF retrieval options enabled, it can take a lot longer (days) and the
|
17
|
+
RDF providers may not be happy about a barrage of requests.
|
18
|
+
|
19
|
+
File IO is the most expensive operation in the MARC-only mode (it helps to have a solid
|
20
|
+
state drive with high IO performance). In the RDF-HTTP retrieval mode, it may help
|
21
|
+
to enable threading for concurrent retrieval of RDF resources. However, it's still
|
22
|
+
relatively slow (exploring options for caching and local downloads of RDF data).
|
23
|
+
Note that it runs a lot slower on jruby-9.0.0.0-pre1 than MRI 2.2.0, whether threads
|
24
|
+
are enabled or not. It raises exceptions on jruby-1.7.9, related to ruby
|
25
|
+
language support (such as Array#delete_if).
|
13
26
|
|
14
27
|
The current output is to the file system, but it should be easy to incorporate
|
15
28
|
and configure alternatives by using the RDF.rb facilities for connecting to a
|
@@ -18,12 +31,8 @@ exploration hasn't matured much, mainly because there is no 'cache-expiry' data
|
|
18
31
|
yet and because it would be better to use an RDF.rb extension of some
|
19
32
|
kind (for redis, mongodb, etc) or to use a triple store/solr platform.
|
20
33
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
Note that it runs a lot slower on jruby-9.0.0.0-pre1 than MRI 2.2.0, whether threads
|
25
|
-
are enabled or not. It raises exceptions on jruby-1.7.9, related to ruby
|
26
|
-
language support (such as Array#delete_if).
|
34
|
+
TODO: Develop on additional example datasets, to evaluate the generality and robustness
|
35
|
+
of the utilities.
|
27
36
|
|
28
37
|
TODO: A significant problem to solve is effective caching or mirrors for linked data.
|
29
38
|
The retrieval should inspect any HTTP cache headers that might be available and
|
@@ -54,8 +63,8 @@ Install with rbenv (on linux)
|
|
54
63
|
echo 'eval "$(rbenv init -)"' >> ~/.bash_profile
|
55
64
|
source ~/.bash_profile
|
56
65
|
git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build
|
57
|
-
rbenv install 2.
|
58
|
-
rbenv global 2.
|
66
|
+
rbenv install 2.2.0 # or the latest ruby available
|
67
|
+
rbenv global 2.2.0
|
59
68
|
rbenv rehash
|
60
69
|
gem install bundle
|
61
70
|
gem install marc2linkeddata
|
@@ -63,20 +72,115 @@ Install with rbenv (on linux)
|
|
63
72
|
Configure
|
64
73
|
|
65
74
|
# set env values and/or create or modify a .env file
|
66
|
-
# see the .env_example file for details
|
67
|
-
marc2LD_config
|
75
|
+
# see the .env_example file for details.
|
68
76
|
# Performance will slow with more retrieval of linked
|
69
77
|
# data resources, such as OCLC works for authorities.
|
78
|
+
marc2LD_config
|
70
79
|
|
71
80
|
Scripting
|
72
81
|
|
73
82
|
# First configure (see details above).
|
74
83
|
# Translate a MARC21 authority file to a turtle file.
|
75
|
-
#
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
84
|
+
# It's assumed that '*.mrc' files contain multiple MARC21
|
85
|
+
# records and the record identifier is in field 001.
|
86
|
+
# marcAuthority2LD [ authfile1.mrc .. authfileN.mrc ]
|
87
|
+
marcAuthority2LD auth.mrc
|
88
|
+
|
89
|
+
# Check the syntax of the output turtle files.
|
90
|
+
touch turtle_syntax_checks.log
|
91
|
+
for f in $(find ./auth_turtle/ -type f -name '*.ttl'); do
|
92
|
+
rapper -c -i turtle $f >> turtle_syntax_checks.log 2>&1
|
93
|
+
done
|
94
|
+
|
95
|
+
Example Output Files
|
96
|
+
|
97
|
+
- In this example, only data in the MARC record was used, without any RDF link
|
98
|
+
resolution or retrieval. The example MARC record already contained links to
|
99
|
+
VIAF and ISNI IRIs (these 9xx MARC fields are identified in the configuration).
|
100
|
+
|
101
|
+
@prefix owl: <http://www.w3.org/2002/07/owl#> .
|
102
|
+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
103
|
+
@prefix schema: <http://schema.org/> .
|
104
|
+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
|
105
|
+
<http://linked-data.stanford.edu/library/authority/N79044798> a schema:Person;
|
106
|
+
schema:name "Byrnes, Christopher I.,";
|
107
|
+
owl:sameAs <http://id.loc.gov/authorities/names/n79044798>,
|
108
|
+
<http://viaf.org/viaf/108317368>,
|
109
|
+
<http://www.isni.org/0000000109311081> .
|
110
|
+
|
111
|
+
- In this example, all the RDF link resolution and retrieval was enabled. Also, the
|
112
|
+
OCLC works for this authority were resolved. The result is an 'authority index' into LOD,
|
113
|
+
including associated works. Although some of the RDF was retrieved in the process (and
|
114
|
+
could be cached in a local triple store), the output record is designed to be an LOD index
|
115
|
+
only. The index could be stored in a local triple store, to be leveraged by local clients
|
116
|
+
that may retrieve and use additional data from the RDF links. Sharing such an 'LOD index'
|
117
|
+
in a distributed network database could be very useful and open opportunities for institutions
|
118
|
+
to collaborate on scaling the link resolution and maintenance issues.
|
119
|
+
|
120
|
+
@prefix owl: <http://www.w3.org/2002/07/owl#> .
|
121
|
+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
122
|
+
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
123
|
+
@prefix schema: <http://schema.org/> .
|
124
|
+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
|
125
|
+
<http://linked-data.example.org/library/authority/N79044798> a schema:Person;
|
126
|
+
schema:familyName "Byrnes";
|
127
|
+
schema:givenName "Christopher Ian",
|
128
|
+
"Christopher I";
|
129
|
+
schema:name "Byrnes, Christopher I., 1949-";
|
130
|
+
owl:sameAs <http://id.loc.gov/authorities/names/n79044798>,
|
131
|
+
<http://viaf.org/viaf/108317368>,
|
132
|
+
<http://www.isni.org/0000000109311081> .
|
133
|
+
<http://id.loc.gov/authorities/names/n79044798> owl:sameAs <http://www.worldcat.org/identities/lccn-n79044798> .
|
134
|
+
<http://www.worldcat.org/identities/lccn-n79044798> rdfs:seeAlso <http://www.worldcat.org/oclc/747413718>,
|
135
|
+
<http://www.worldcat.org/oclc/017649403>,
|
136
|
+
<http://www.worldcat.org/oclc/004933024>,
|
137
|
+
<http://www.worldcat.org/oclc/007170722>,
|
138
|
+
<http://www.worldcat.org/oclc/006626542>,
|
139
|
+
<http://www.worldcat.org/oclc/050868185>,
|
140
|
+
<http://www.worldcat.org/oclc/013525712>,
|
141
|
+
<http://www.worldcat.org/oclc/013700764>,
|
142
|
+
<http://www.worldcat.org/oclc/036387153>,
|
143
|
+
<http://www.worldcat.org/oclc/013525674>,
|
144
|
+
<http://www.worldcat.org/oclc/013700768>,
|
145
|
+
<http://www.worldcat.org/oclc/018380395>,
|
146
|
+
<http://www.worldcat.org/oclc/018292079>,
|
147
|
+
<http://www.worldcat.org/oclc/023969230>,
|
148
|
+
<http://www.worldcat.org/oclc/035911289>,
|
149
|
+
<http://www.worldcat.org/oclc/495781917>,
|
150
|
+
<http://www.worldcat.org/oclc/727657045>,
|
151
|
+
<http://www.worldcat.org/oclc/782013318>,
|
152
|
+
<http://www.worldcat.org/oclc/037671494>,
|
153
|
+
<http://www.worldcat.org/oclc/751661734>,
|
154
|
+
<http://www.worldcat.org/oclc/800600611> .
|
155
|
+
|
156
|
+
- In addition, when the option to resolve OCLC works was enabled (OCLC_AUTH2WORKS option), the
|
157
|
+
following triples were added to those above.
|
158
|
+
|
159
|
+
<http://www.worldcat.org/oclc/004933024> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/796991413> .
|
160
|
+
<http://www.worldcat.org/oclc/006626542> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/111527266> .
|
161
|
+
<http://www.worldcat.org/oclc/007170722> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/144285064> .
|
162
|
+
<http://www.worldcat.org/oclc/013525674> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/7358848> .
|
163
|
+
<http://www.worldcat.org/oclc/013525712> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/7360091> .
|
164
|
+
<http://www.worldcat.org/oclc/013700764> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/366036025> .
|
165
|
+
<http://www.worldcat.org/oclc/013700768> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/366036042> .
|
166
|
+
<http://www.worldcat.org/oclc/017649403> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/866252320> .
|
167
|
+
<http://www.worldcat.org/oclc/018292079> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/836712068> .
|
168
|
+
<http://www.worldcat.org/oclc/018380395> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/365996343> .
|
169
|
+
<http://www.worldcat.org/oclc/023969230> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/890420837> .
|
170
|
+
<http://www.worldcat.org/oclc/035911289> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/355875201> .
|
171
|
+
<http://www.worldcat.org/oclc/036387153> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/622568> .
|
172
|
+
<http://www.worldcat.org/oclc/037671494> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/9216290> .
|
173
|
+
<http://www.worldcat.org/oclc/050868185> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/366714531> .
|
174
|
+
<http://www.worldcat.org/oclc/495781917> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
175
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/994448191> .
|
176
|
+
<http://www.worldcat.org/oclc/727657045> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
177
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/1811109792> .
|
178
|
+
<http://www.worldcat.org/oclc/747413718> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/994448191> .
|
179
|
+
<http://www.worldcat.org/oclc/751661734> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
180
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/1816359357> .
|
181
|
+
<http://www.worldcat.org/oclc/782013318> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
182
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/146829946> .
|
183
|
+
<http://www.worldcat.org/oclc/889440750> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/2061462527> .
|
80
184
|
|
81
185
|
|
82
186
|
Ruby Library Use
|
@@ -93,7 +197,8 @@ Ruby Library Use
|
|
93
197
|
record = MARC::Reader.decode(raw)
|
94
198
|
auth = ParseMarcAuthority.new(record)
|
95
199
|
auth_id = "auth:#{auth.get_id}"
|
96
|
-
|
200
|
+
graph = auth.graph
|
201
|
+
puts graph.to_ttl
|
97
202
|
end
|
98
203
|
end
|
99
204
|
|
@@ -105,7 +210,12 @@ Development
|
|
105
210
|
./bin/test.sh
|
106
211
|
cp .env_example .env # and edit .env
|
107
212
|
# develop code and/or bin scripts; run bin scripts, e.g.
|
108
|
-
.binstubs/marcAuthority2LD auth.
|
213
|
+
.binstubs/marcAuthority2LD auth.mrc
|
214
|
+
# Look for results in auth_turtle/*.ttl files.
|
215
|
+
# see also full example script in
|
216
|
+
#.binstubs/run_test_data.sh
|
217
|
+
# which includes shell script for basic stats and
|
218
|
+
# using rapper to check the file output syntax.
|
109
219
|
|
110
220
|
|
111
221
|
# License
|
data/bin/marcAuthority2LD
CHANGED
@@ -4,6 +4,7 @@ module Marc2LinkedData
|
|
4
4
|
class Configuration
|
5
5
|
|
6
6
|
attr_accessor :debug
|
7
|
+
attr_accessor :test_records
|
7
8
|
|
8
9
|
attr_accessor :threads
|
9
10
|
attr_accessor :thread_limit
|
@@ -38,6 +39,8 @@ module Marc2LinkedData
|
|
38
39
|
|
39
40
|
def initialize
|
40
41
|
@debug = env_boolean('DEBUG')
|
42
|
+
@test_records = ENV['TEST_RECORDS'].to_i
|
43
|
+
|
41
44
|
@threads = env_boolean('THREADS')
|
42
45
|
@thread_limit = ENV['THREAD_LIMIT'].to_i || 25
|
43
46
|
|
@@ -95,20 +95,7 @@ module Marc2LinkedData
|
|
95
95
|
unless loc_iri.nil?
|
96
96
|
# Verify the URL (used HEAD so it's as fast as possible)
|
97
97
|
@@config.logger.debug "Trying to validate LOC IRI: #{loc_iri}"
|
98
|
-
|
99
|
-
case res.code
|
100
|
-
when '200'
|
101
|
-
# it's good to go
|
102
|
-
when '301'
|
103
|
-
# use the redirection
|
104
|
-
loc_iri = res['location']
|
105
|
-
when '302','303'
|
106
|
-
#302 Moved Temporarily
|
107
|
-
#303 See Other
|
108
|
-
# Use the current URL, most get requests will follow a 302 or 303
|
109
|
-
else
|
110
|
-
loc_iri = nil
|
111
|
-
end
|
98
|
+
loc_iri = Marc2LinkedData.http_head_request(loc_iri + '.rdf')
|
112
99
|
end
|
113
100
|
if loc_iri.nil?
|
114
101
|
# If it gets here, it's a problem.
|
@@ -84,34 +84,19 @@ module Marc2LinkedData
|
|
84
84
|
|
85
85
|
def resolve_external_auth(url)
|
86
86
|
begin
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
#301 Moved Permanently
|
94
|
-
url = res['location']
|
95
|
-
@@config.logger.debug "Mapped #{@iri}\t-> #{url}"
|
96
|
-
return url
|
97
|
-
when '302','303'
|
98
|
-
#302 Moved Temporarily
|
99
|
-
#303 See Other
|
100
|
-
# Use the current URL, most get requests will follow a 302 or 303
|
101
|
-
@@config.logger.debug "Mapped #{@iri}\t-> #{url}"
|
102
|
-
return url
|
103
|
-
when '404'
|
104
|
-
@@config.logger.warn "#{@iri}\t// #{url}"
|
105
|
-
return nil
|
106
|
-
else
|
107
|
-
# WTF
|
108
|
-
binding.pry if @@config.debug
|
109
|
-
@@config.logger.error "unknown http response code (#{res.code}) for #{@iri}"
|
110
|
-
return nil
|
87
|
+
# RestClient does all the response code handling and redirection.
|
88
|
+
url = Marc2LinkedData.http_head_request(url)
|
89
|
+
if url.nil?
|
90
|
+
@@config.logger.warn "#{@iri}\t// #{url}"
|
91
|
+
else
|
92
|
+
@@config.logger.debug "Mapped #{@iri}\t-> #{url}"
|
111
93
|
end
|
112
94
|
rescue
|
113
|
-
|
95
|
+
binding.pry if @@config.debug
|
96
|
+
@@config.logger.error "unknown http error for #{@iri}"
|
97
|
+
url = nil
|
114
98
|
end
|
99
|
+
url
|
115
100
|
end
|
116
101
|
|
117
102
|
def same_as
|
data/lib/marc2linkeddata.rb
CHANGED
@@ -23,23 +23,20 @@ module Marc2LinkedData
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def self.http_head_request(url)
|
26
|
-
uri =
|
26
|
+
uri = nil
|
27
27
|
begin
|
28
|
-
|
29
|
-
|
30
|
-
else
|
31
|
-
req = Net::HTTP::Head.new(uri)
|
32
|
-
end
|
33
|
-
Net::HTTP.start(uri.host, uri.port) {|http| http.request req }
|
28
|
+
response = RestClient.head(url)
|
29
|
+
uri = response.args[:url]
|
34
30
|
rescue
|
35
|
-
@configuration.logger.error "
|
31
|
+
@configuration.logger.error "RestClient.head failed for #{url}"
|
36
32
|
begin
|
37
|
-
|
33
|
+
response = RestClient.get(url)
|
34
|
+
uri = response.args[:url]
|
38
35
|
rescue
|
39
|
-
@configuration.logger.error "
|
40
|
-
nil
|
36
|
+
@configuration.logger.error "RestClient.get failed for #{url}"
|
41
37
|
end
|
42
38
|
end
|
39
|
+
uri
|
43
40
|
end
|
44
41
|
|
45
42
|
def self.write_prefixes(file)
|
data/marc2linkeddata.gemspec
CHANGED