marc2linkeddata 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env_example +4 -0
- data/README.md +131 -21
- data/bin/marcAuthority2LD +1 -0
- data/lib/marc2linkeddata/configuration.rb +3 -0
- data/lib/marc2linkeddata/parseMarcAuthority.rb +1 -14
- data/lib/marc2linkeddata/resource.rb +10 -25
- data/lib/marc2linkeddata.rb +8 -11
- data/marc2linkeddata.gemspec +1 -1
- data/spec/marc2linkeddata/loc_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30923ebbb08cf2eb45cbe20a79bfc115fb8f695a
|
4
|
+
data.tar.gz: 1a98f477c2f8c9b61b4efb1f8c2a4d171862f2f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c92c1369d3e39d46df6f712a94c27189c848c36b04c529fe1d9cfd2e341da9a1c8fe56f974fb7c011672fb59369572f9427307c077de9b1db0d29f6ced7a7dce
|
7
|
+
data.tar.gz: bf3e016ec9a3b01c6ed7b1bede449ee1c7791fb34fca1150d1e1efde09eb6f94436692e7e4461f32b9458ebd34890b16428bff7e532d8a0aac69a01a13e3a170
|
data/.env_example
CHANGED
@@ -11,8 +11,12 @@
|
|
11
11
|
# Uncomment and set values as required. See used settings in
|
12
12
|
# lib/marc2linkeddata/configuration.rb
|
13
13
|
|
14
|
+
# Enable debug logging and breakpoints at problematic code points.
|
14
15
|
export DEBUG=false
|
15
16
|
|
17
|
+
# Only read X MARC records (for testing purposes).
|
18
|
+
export TEST_RECORDS=0 # 0 for all records
|
19
|
+
|
16
20
|
export LOG_FILE='marc2ld.log'
|
17
21
|
export LIB_PREFIX=http://linked-data.example.org/library/
|
18
22
|
|
data/README.md
CHANGED
@@ -6,10 +6,23 @@ Utilities for translating MARC21 into linked data. The project has
|
|
6
6
|
focused on authority records (as of 2015).
|
7
7
|
|
8
8
|
It has config options that can be enabled to increase the amount of data retrieved.
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
9
|
+
All config options are set by environment variables. The .env_example file documents
|
10
|
+
the options available and how to use a .env file; the `marc2LD_config` utility will
|
11
|
+
copy the .env_example file provided into the current path.
|
12
|
+
|
13
|
+
Without any HTTP retrieval of RDF metadata, using only data in a MARC record, it can
|
14
|
+
translate 100,000 authority records in about 5-6 min on a current laptop system. The
|
15
|
+
config options allow specification of MARC fields that may already contain resource links.
|
16
|
+
With HTTP/RDF retrieval options enabled, it can take a lot longer (days) and the
|
17
|
+
RDF providers may not be happy about a barrage of requests.
|
18
|
+
|
19
|
+
File IO is the most expensive operation in the MARC-only mode (it helps to have a solid
|
20
|
+
state drive with high IO performance). In the RDF-HTTP retrieval mode, it may help
|
21
|
+
to enable threading for concurrent retrieval of RDF resources. However, it's still
|
22
|
+
relatively slow (exploring options for caching and local downloads of RDF data).
|
23
|
+
Note that it runs a lot slower on jruby-9.0.0.0-pre1 than MRI 2.2.0, whether threads
|
24
|
+
are enabled or not. It raises exceptions on jruby-1.7.9, related to ruby
|
25
|
+
language support (such as Array#delete_if).
|
13
26
|
|
14
27
|
The current output is to the file system, but it should be easy to incorporate
|
15
28
|
and configure alternatives by using the RDF.rb facilities for connecting to a
|
@@ -18,12 +31,8 @@ exploration hasn't matured much, mainly because there is no 'cache-expiry' data
|
|
18
31
|
yet and because it would be better to use an RDF.rb extension of some
|
19
32
|
kind (for redis, mongodb, etc) or to use a triple store/solr platform.
|
20
33
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
Note that it runs a lot slower on jruby-9.0.0.0-pre1 than MRI 2.2.0, whether threads
|
25
|
-
are enabled or not. It raises exceptions on jruby-1.7.9, related to ruby
|
26
|
-
language support (such as Array#delete_if).
|
34
|
+
TODO: Develop on additional example datasets, to evaluate the generality and robustness
|
35
|
+
of the utilities.
|
27
36
|
|
28
37
|
TODO: A significant problem to solve is effective caching or mirrors for linked data.
|
29
38
|
The retrieval should inspect any HTTP cache headers that might be available and
|
@@ -54,8 +63,8 @@ Install with rbenv (on linux)
|
|
54
63
|
echo 'eval "$(rbenv init -)"' >> ~/.bash_profile
|
55
64
|
source ~/.bash_profile
|
56
65
|
git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build
|
57
|
-
rbenv install 2.
|
58
|
-
rbenv global 2.
|
66
|
+
rbenv install 2.2.0 # or the latest ruby available
|
67
|
+
rbenv global 2.2.0
|
59
68
|
rbenv rehash
|
60
69
|
gem install bundle
|
61
70
|
gem install marc2linkeddata
|
@@ -63,20 +72,115 @@ Install with rbenv (on linux)
|
|
63
72
|
Configure
|
64
73
|
|
65
74
|
# set env values and/or create or modify a .env file
|
66
|
-
# see the .env_example file for details
|
67
|
-
marc2LD_config
|
75
|
+
# see the .env_example file for details.
|
68
76
|
# Performance will slow with more retrieval of linked
|
69
77
|
# data resources, such as OCLC works for authorities.
|
78
|
+
marc2LD_config
|
70
79
|
|
71
80
|
Scripting
|
72
81
|
|
73
82
|
# First configure (see details above).
|
74
83
|
# Translate a MARC21 authority file to a turtle file.
|
75
|
-
#
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
84
|
+
# It's assumed that '*.mrc' files contain multiple MARC21
|
85
|
+
# records and the record identifier is in field 001.
|
86
|
+
# marcAuthority2LD [ authfile1.mrc .. authfileN.mrc ]
|
87
|
+
marcAuthority2LD auth.mrc
|
88
|
+
|
89
|
+
# Check the syntax of the output turtle files.
|
90
|
+
touch turtle_syntax_checks.log
|
91
|
+
for f in $(find ./auth_turtle/ -type f -name '*.ttl'); do
|
92
|
+
rapper -c -i turtle $f >> turtle_syntax_checks.log 2>&1
|
93
|
+
done
|
94
|
+
|
95
|
+
Example Output Files
|
96
|
+
|
97
|
+
- In this example, only data in the MARC record was used, without any RDF link
|
98
|
+
resolution or retrieval. The example MARC record already contained links to
|
99
|
+
VIAF and ISNI IRIs (these 9xx MARC fields are identified in the configuration).
|
100
|
+
|
101
|
+
@prefix owl: <http://www.w3.org/2002/07/owl#> .
|
102
|
+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
103
|
+
@prefix schema: <http://schema.org/> .
|
104
|
+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
|
105
|
+
<http://linked-data.stanford.edu/library/authority/N79044798> a schema:Person;
|
106
|
+
schema:name "Byrnes, Christopher I.,";
|
107
|
+
owl:sameAs <http://id.loc.gov/authorities/names/n79044798>,
|
108
|
+
<http://viaf.org/viaf/108317368>,
|
109
|
+
<http://www.isni.org/0000000109311081> .
|
110
|
+
|
111
|
+
- In this example, all the RDF link resolution and retrieval was enabled. Also, the
|
112
|
+
OCLC works for this authority were resolved. The result is an 'authority index' into LOD,
|
113
|
+
including associated works. Although some of the RDF was retrieved in the process (and
|
114
|
+
could be cached in a local triple store), the output record is designed to be an LOD index
|
115
|
+
only. The index could be stored in a local triple store, to be leveraged by local clients
|
116
|
+
that may retrieve and use additional data from the RDF links. Sharing such an 'LOD index'
|
117
|
+
in a distributed network database could be very useful and open opportunities for institutions
|
118
|
+
to collaborate on scaling the link resolution and maintenance issues.
|
119
|
+
|
120
|
+
@prefix owl: <http://www.w3.org/2002/07/owl#> .
|
121
|
+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
122
|
+
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
123
|
+
@prefix schema: <http://schema.org/> .
|
124
|
+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
|
125
|
+
<http://linked-data.example.org/library/authority/N79044798> a schema:Person;
|
126
|
+
schema:familyName "Byrnes";
|
127
|
+
schema:givenName "Christopher Ian",
|
128
|
+
"Christopher I";
|
129
|
+
schema:name "Byrnes, Christopher I., 1949-";
|
130
|
+
owl:sameAs <http://id.loc.gov/authorities/names/n79044798>,
|
131
|
+
<http://viaf.org/viaf/108317368>,
|
132
|
+
<http://www.isni.org/0000000109311081> .
|
133
|
+
<http://id.loc.gov/authorities/names/n79044798> owl:sameAs <http://www.worldcat.org/identities/lccn-n79044798> .
|
134
|
+
<http://www.worldcat.org/identities/lccn-n79044798> rdfs:seeAlso <http://www.worldcat.org/oclc/747413718>,
|
135
|
+
<http://www.worldcat.org/oclc/017649403>,
|
136
|
+
<http://www.worldcat.org/oclc/004933024>,
|
137
|
+
<http://www.worldcat.org/oclc/007170722>,
|
138
|
+
<http://www.worldcat.org/oclc/006626542>,
|
139
|
+
<http://www.worldcat.org/oclc/050868185>,
|
140
|
+
<http://www.worldcat.org/oclc/013525712>,
|
141
|
+
<http://www.worldcat.org/oclc/013700764>,
|
142
|
+
<http://www.worldcat.org/oclc/036387153>,
|
143
|
+
<http://www.worldcat.org/oclc/013525674>,
|
144
|
+
<http://www.worldcat.org/oclc/013700768>,
|
145
|
+
<http://www.worldcat.org/oclc/018380395>,
|
146
|
+
<http://www.worldcat.org/oclc/018292079>,
|
147
|
+
<http://www.worldcat.org/oclc/023969230>,
|
148
|
+
<http://www.worldcat.org/oclc/035911289>,
|
149
|
+
<http://www.worldcat.org/oclc/495781917>,
|
150
|
+
<http://www.worldcat.org/oclc/727657045>,
|
151
|
+
<http://www.worldcat.org/oclc/782013318>,
|
152
|
+
<http://www.worldcat.org/oclc/037671494>,
|
153
|
+
<http://www.worldcat.org/oclc/751661734>,
|
154
|
+
<http://www.worldcat.org/oclc/800600611> .
|
155
|
+
|
156
|
+
- In addition, when the option to resolve OCLC works is enabled (OCLC_AUTH2WORKS option), the
|
157
|
+
following triples were added to those above.
|
158
|
+
|
159
|
+
<http://www.worldcat.org/oclc/004933024> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/796991413> .
|
160
|
+
<http://www.worldcat.org/oclc/006626542> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/111527266> .
|
161
|
+
<http://www.worldcat.org/oclc/007170722> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/144285064> .
|
162
|
+
<http://www.worldcat.org/oclc/013525674> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/7358848> .
|
163
|
+
<http://www.worldcat.org/oclc/013525712> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/7360091> .
|
164
|
+
<http://www.worldcat.org/oclc/013700764> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/366036025> .
|
165
|
+
<http://www.worldcat.org/oclc/013700768> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/366036042> .
|
166
|
+
<http://www.worldcat.org/oclc/017649403> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/866252320> .
|
167
|
+
<http://www.worldcat.org/oclc/018292079> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/836712068> .
|
168
|
+
<http://www.worldcat.org/oclc/018380395> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/365996343> .
|
169
|
+
<http://www.worldcat.org/oclc/023969230> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/890420837> .
|
170
|
+
<http://www.worldcat.org/oclc/035911289> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/355875201> .
|
171
|
+
<http://www.worldcat.org/oclc/036387153> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/622568> .
|
172
|
+
<http://www.worldcat.org/oclc/037671494> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/9216290> .
|
173
|
+
<http://www.worldcat.org/oclc/050868185> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/366714531> .
|
174
|
+
<http://www.worldcat.org/oclc/495781917> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
175
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/994448191> .
|
176
|
+
<http://www.worldcat.org/oclc/727657045> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
177
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/1811109792> .
|
178
|
+
<http://www.worldcat.org/oclc/747413718> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/994448191> .
|
179
|
+
<http://www.worldcat.org/oclc/751661734> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
180
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/1816359357> .
|
181
|
+
<http://www.worldcat.org/oclc/782013318> schema:contributor <http://www.worldcat.org/identities/lccn-n79044798>;
|
182
|
+
schema:exampleOfWork <http://www.worldcat.org/entity/work/id/146829946> .
|
183
|
+
<http://www.worldcat.org/oclc/889440750> schema:exampleOfWork <http://www.worldcat.org/entity/work/id/2061462527> .
|
80
184
|
|
81
185
|
|
82
186
|
Ruby Library Use
|
@@ -93,7 +197,8 @@ Ruby Library Use
|
|
93
197
|
record = MARC::Reader.decode(raw)
|
94
198
|
auth = ParseMarcAuthority.new(record)
|
95
199
|
auth_id = "auth:#{auth.get_id}"
|
96
|
-
|
200
|
+
graph = auth.graph
|
201
|
+
puts graph.to_ttl
|
97
202
|
end
|
98
203
|
end
|
99
204
|
|
@@ -105,7 +210,12 @@ Development
|
|
105
210
|
./bin/test.sh
|
106
211
|
cp .env_example .env # and edit .env
|
107
212
|
# develop code and/or bin scripts; run bin scripts, e.g.
|
108
|
-
.binstubs/marcAuthority2LD auth.
|
213
|
+
.binstubs/marcAuthority2LD auth.mrc
|
214
|
+
# Look for results in auth_turtle/*.ttl files.
|
215
|
+
# see also full example script in
|
216
|
+
#.binstubs/run_test_data.sh
|
217
|
+
# which includes shell script for basic stats and
|
218
|
+
# using rapper to check the file output syntax.
|
109
219
|
|
110
220
|
|
111
221
|
# License
|
data/bin/marcAuthority2LD
CHANGED
@@ -4,6 +4,7 @@ module Marc2LinkedData
|
|
4
4
|
class Configuration
|
5
5
|
|
6
6
|
attr_accessor :debug
|
7
|
+
attr_accessor :test_records
|
7
8
|
|
8
9
|
attr_accessor :threads
|
9
10
|
attr_accessor :thread_limit
|
@@ -38,6 +39,8 @@ module Marc2LinkedData
|
|
38
39
|
|
39
40
|
def initialize
|
40
41
|
@debug = env_boolean('DEBUG')
|
42
|
+
@test_records = ENV['TEST_RECORDS'].to_i
|
43
|
+
|
41
44
|
@threads = env_boolean('THREADS')
|
42
45
|
@thread_limit = ENV['THREAD_LIMIT'].to_i || 25
|
43
46
|
|
@@ -95,20 +95,7 @@ module Marc2LinkedData
|
|
95
95
|
unless loc_iri.nil?
|
96
96
|
# Verify the URL (used HEAD so it's as fast as possible)
|
97
97
|
@@config.logger.debug "Trying to validate LOC IRI: #{loc_iri}"
|
98
|
-
|
99
|
-
case res.code
|
100
|
-
when '200'
|
101
|
-
# it's good to go
|
102
|
-
when '301'
|
103
|
-
# use the redirection
|
104
|
-
loc_iri = res['location']
|
105
|
-
when '302','303'
|
106
|
-
#302 Moved Temporarily
|
107
|
-
#303 See Other
|
108
|
-
# Use the current URL, most get requests will follow a 302 or 303
|
109
|
-
else
|
110
|
-
loc_iri = nil
|
111
|
-
end
|
98
|
+
loc_iri = Marc2LinkedData.http_head_request(loc_iri + '.rdf')
|
112
99
|
end
|
113
100
|
if loc_iri.nil?
|
114
101
|
# If it gets here, it's a problem.
|
@@ -84,34 +84,19 @@ module Marc2LinkedData
|
|
84
84
|
|
85
85
|
def resolve_external_auth(url)
|
86
86
|
begin
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
#301 Moved Permanently
|
94
|
-
url = res['location']
|
95
|
-
@@config.logger.debug "Mapped #{@iri}\t-> #{url}"
|
96
|
-
return url
|
97
|
-
when '302','303'
|
98
|
-
#302 Moved Temporarily
|
99
|
-
#303 See Other
|
100
|
-
# Use the current URL, most get requests will follow a 302 or 303
|
101
|
-
@@config.logger.debug "Mapped #{@iri}\t-> #{url}"
|
102
|
-
return url
|
103
|
-
when '404'
|
104
|
-
@@config.logger.warn "#{@iri}\t// #{url}"
|
105
|
-
return nil
|
106
|
-
else
|
107
|
-
# WTF
|
108
|
-
binding.pry if @@config.debug
|
109
|
-
@@config.logger.error "unknown http response code (#{res.code}) for #{@iri}"
|
110
|
-
return nil
|
87
|
+
# RestClient does all the response code handling and redirection.
|
88
|
+
url = Marc2LinkedData.http_head_request(url)
|
89
|
+
if url.nil?
|
90
|
+
@@config.logger.warn "#{@iri}\t// #{url}"
|
91
|
+
else
|
92
|
+
@@config.logger.debug "Mapped #{@iri}\t-> #{url}"
|
111
93
|
end
|
112
94
|
rescue
|
113
|
-
|
95
|
+
binding.pry if @@config.debug
|
96
|
+
@@config.logger.error "unknown http error for #{@iri}"
|
97
|
+
url = nil
|
114
98
|
end
|
99
|
+
url
|
115
100
|
end
|
116
101
|
|
117
102
|
def same_as
|
data/lib/marc2linkeddata.rb
CHANGED
@@ -23,23 +23,20 @@ module Marc2LinkedData
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def self.http_head_request(url)
|
26
|
-
uri =
|
26
|
+
uri = nil
|
27
27
|
begin
|
28
|
-
|
29
|
-
|
30
|
-
else
|
31
|
-
req = Net::HTTP::Head.new(uri)
|
32
|
-
end
|
33
|
-
Net::HTTP.start(uri.host, uri.port) {|http| http.request req }
|
28
|
+
response = RestClient.head(url)
|
29
|
+
uri = response.args[:url]
|
34
30
|
rescue
|
35
|
-
@configuration.logger.error "
|
31
|
+
@configuration.logger.error "RestClient.head failed for #{url}"
|
36
32
|
begin
|
37
|
-
|
33
|
+
response = RestClient.get(url)
|
34
|
+
uri = response.args[:url]
|
38
35
|
rescue
|
39
|
-
@configuration.logger.error "
|
40
|
-
nil
|
36
|
+
@configuration.logger.error "RestClient.get failed for #{url}"
|
41
37
|
end
|
42
38
|
end
|
39
|
+
uri
|
43
40
|
end
|
44
41
|
|
45
42
|
def self.write_prefixes(file)
|
data/marc2linkeddata.gemspec
CHANGED