marc2linkeddata 0.0.7 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f54e31c819d0978629746c13c88690d5b2f1f392
4
- data.tar.gz: 5325910c217a5e1bd3d45bb0a1f0bdb3b48fa11e
3
+ metadata.gz: f9539a926fe1d42b3827b55c4e44245859f0c1e2
4
+ data.tar.gz: 048690d375535f681bf518af81639d6b6dca4af4
5
5
  SHA512:
6
- metadata.gz: 2565a8648875bc22f3b62ccd7cd1aad47ce843d388a93f6e4e363f7bab77644abbf56a1a70f73de9d6ac9aa9be3857578b62c87e5ebc2d4e50b157afaea7c7ae
7
- data.tar.gz: a8ba98f07795a0c97f4bd00d9c431ea7a4dd15d39952f1cdbc56a4136d4b6e60f2bb0a6d05cb6f3affe5d5eecd03dc6752b4093dcb8b149f61d36583911af610
6
+ metadata.gz: 7d42f2629e02882f94e60ceb1fe8503766d88c2cd108b0e57fb87d8704663f6a693667d098832d4c793e5e3ff285b04b3541d297d989529e63782dfbae5289a8
7
+ data.tar.gz: fa4fe996e40a4de2cb0bffab698dc76c843612ab59055c498b0a99af4f6d50a05156c0e715bcbfd074643e04ff0578687e555e3753eeb6990417f2f246ed11ab
data/.env_example CHANGED
@@ -1,46 +1,71 @@
1
1
  # https://github.com/bkeepers/dotenv is used for
2
2
  # default configuration options. The values in
3
3
  # this file do not replace existing values in
4
- # the shell ENV.
4
+ # the shell ENV, but these settings will be in
5
+ # effect when the shell ENV doesn't contain them.
6
+ # To add these settings to the shell ENV, use
7
+ # $ source .env
8
+ # To override individual settings in this file, just
9
+ # set them in the shell ENV before running a script.
5
10
 
6
11
  # Uncomment and set values as required. See used settings in
7
12
  # lib/marc2linkeddata/configuration.rb
8
13
 
9
- DEBUG: false
14
+ export DEBUG=false
15
+
16
+ export LOG_FILE='marc2ld.log'
17
+ export LIB_PREFIX=http://linked-data.example.org/library/
18
+
19
+ # Without any options enabled to GET_* via HTTP, the process
20
+ # is largely file IO bound, rather than CPU bound. In this
21
+ # case, threading can decrease performance. However, when any
22
+ # GET_* options are enabled for retrieval of RDF over HTTP,
23
+ # threading may improve performance.
24
+ export THREADS=true
25
+ export THREAD_LIMIT=25
10
26
 
11
27
  # Authority record field numbers for useful link data
12
- FIELD_AUTH_LOC: 920
13
- FIELD_AUTH_VIAF: 921
14
- FIELD_AUTH_ISNI: 922
15
- FIELD_AUTH_OCLC: 035
28
+ export FIELD_AUTH_LOC=920
29
+ export FIELD_AUTH_VIAF=921
30
+ export FIELD_AUTH_ISNI=922
31
+ export FIELD_AUTH_OCLC=035
16
32
 
17
33
  # Options for retrieving linked data to resolve and enhance data.
18
- # Set all false for the quickest translation.
19
- GET_ISNI: false
20
- GET_LOC: true # currently required, should be optional
21
- GET_OCLC: false
22
- GET_VIAF: false
34
+ # Set all false for the quickest translation. Note that having an
35
+ # LOC identifier (MARC record number may be sufficient) is usually
36
+ # a prelude to getting additional linked data. If any of these are
37
+ # enabled, it may require GET_LOC=true to have greater success.
38
+ export GET_LOC=false
39
+ export GET_ISNI=false
40
+ export GET_OCLC=false
41
+ export GET_VIAF=false
23
42
 
24
43
  # Using OCLC identity, retrieve RDF for creative works?
25
44
  # Only works when GET_OCLC==true; it can slow processing significantly.
26
- OCLC_AUTH2WORKS: false
27
-
28
- LOG_FILE: 'marc2ld.log'
29
-
30
- LIB_PREFIX: http://linked-data.example.org/library/
45
+ export OCLC_AUTH2WORKS=false
31
46
 
32
47
  # Use FOAF or SCHEMA or both
33
- USE_FOAF: false
34
- USE_SCHEMA: true
48
+ export USE_FOAF=false
49
+ export USE_SCHEMA=true
35
50
 
36
51
  # Local triple store for LOC authority data,
37
52
  # accessed via an HTTP API with basic authentication.
38
53
  # See downloads at http://id.loc.gov/download/
39
- LOCAL_LOC_USER: 'sparqlUser'
40
- LOCAL_LOC_PASS: 'sparqlPass'
41
- LOCAL_LOC_HOST: 'dev-sparql.example.org'
42
- LOCAL_LOC_PORT: '80'
43
- LOCAL_LOC_PATH: '/sparql?'
54
+ export LOCAL_LOC_USER='sparqlUser'
55
+ export LOCAL_LOC_PASS='sparqlPass'
56
+ export LOCAL_LOC_HOST='dev-sparql.example.org'
57
+ export LOCAL_LOC_PORT='80'
58
+ export LOCAL_LOC_PATH='/sparql?'
59
+
60
+ # SUL-CAP resources
61
+ # May require ssh port forwarding, e.g.:
62
+ # ssh ${USER}@cap-mysql-host.example.com -L 3308:localhost:3306 -N &
63
+ export SUL_CAP_ENABLED=false
64
+ export SUL_CAP_DB_HOST=localhost
65
+ export SUL_CAP_DB_PORT=3306
66
+ export SUL_CAP_DB_USER=capUser
67
+ export SUL_CAP_DB_PASSWORD=capPass
68
+ export SUL_CAP_DB_DATABASE=cap
44
69
 
45
70
  # Redis Persistence - based on https://github.com/redis/redis-rb
46
71
  # - essential options:
@@ -52,11 +77,11 @@ LOCAL_LOC_PATH: '/sparql?'
52
77
  # # faster reading of triples from pre-populated redis data
53
78
  # export REDIS_WRITE=true # enable redis writes (default = REDIS4MARC || false)
54
79
  # # current data is updated in redis
55
- REDIS4MARC: false
80
+ export REDIS4MARC=false
56
81
  # Uncomment these options to disable read or write (independently)
57
- #REDIS_READ: false
58
- #REDIS_WRITE: false
82
+ #export REDIS_READ=false
83
+ #export REDIS_WRITE=false
59
84
  # Leave commented to use default redis configs on localhost
60
- #REDIS_URL: localhost
85
+ #export REDIS_URL=localhost
61
86
 
62
87
 
data/README.md CHANGED
@@ -3,13 +3,44 @@ marc2linkeddata
3
3
  ===============
4
4
 
5
5
  Utilities for translating MARC21 into linked data. The project has
6
- focused on authority records (as of Feb, 2015).
6
+ focused on authority records (as of 2015).
7
+
8
+ It has config options that can be enabled to increase the amount of data retrieved.
9
+ Without any HTTP options enabled, using only data in the MARC record, it can
10
+ translate 100,000 authority records in about 5-6 min on a current laptop system.
11
+ File IO is the most expensive operation in this mode, so it helps to have a solid
12
+ state drive or something with high IO performance.
13
+
14
+ The current output is to the file system, but it should be easy to incorporate
15
+ and configure alternatives by using the RDF.rb facilities for connecting to a
16
+ repository. A minor attempt was explored to use redis for caching, but that
17
+ exploration hasn't matured much, mainly because there is no 'cache-expiry' data
18
+ yet and because it would be better to use an RDF.rb extension of some
19
+ kind (for redis, mongodb, etc) or to use a triple store/solr platform.
20
+
21
+ With HTTP/RDF retrieval options enabled, it can take a lot longer (days) and the
22
+ providers may not be very happy about a barrage of requests.
23
+
24
+ Note that it runs a lot slower on jruby-9.0.0.0-pre1 than MRI 2.2.0, whether threads
25
+ are enabled or not. It raises exceptions on jruby-1.7.9, related to ruby
26
+ language support (such as Array#delete_if).
27
+
28
+ TODO: A significant problem to solve is effective caching or mirrors for linked data.
29
+ The retrieval should inspect any HTTP cache headers that might be available and
30
+ add PROV-O to the linked-data graph generated for each record.
31
+
32
+ TODO: Provide system platform options, to dockerize the application and make it easier
33
+ for automatic horizontal scaling. Consider https://www.packer.io/intro/index.html
7
34
 
8
35
  Optional Dependencies
9
36
 
10
- - http://redis.io/
11
37
  - http://4store.org/
38
+ - http://www.mongodb.org/
39
+ - http://redis.io/
12
40
  - see notes below
41
+ - see also:
42
+ - http://marmotta.apache.org
43
+ - http://stardog.com
13
44
 
14
45
  Install
15
46
 
data/bin/marcAuthority2LD CHANGED
@@ -4,6 +4,17 @@ require 'marc2linkeddata'
4
4
 
5
5
  CONFIG = Marc2LinkedData.configuration
6
6
 
7
# Report a failure on $stderr: a blank line, an 'ERROR' banner, the
# exception message, the backtrace (one frame per line), a blank line,
# and finally the string form of the record being processed.
#
# e      - the rescued exception.
# record - the record in flight when the exception was raised; anything
#          that responds to to_s (nil prints as an empty line).
def stack_trace(e, record)
  $stderr.puts
  $stderr.puts 'ERROR'
  $stderr.puts e.message
  # puts prints each element of an Array on its own line, whereas write
  # would emit the Array's inspect form glued onto the message.
  $stderr.puts e.backtrace unless e.backtrace.nil?
  $stderr.puts
  $stderr.puts record.to_s
end
16
+
17
+ # Count all the records in the MARC file.
7
18
  def marc_auth_count(marc_file)
8
19
  auth_records = 0
9
20
  until marc_file.eof?
@@ -12,11 +23,7 @@ def marc_auth_count(marc_file)
12
23
  marc_file.seek(leader[:length], IO::SEEK_CUR)
13
24
  auth_records += 1 if leader[:type] == 'z'
14
25
  rescue => e
15
- puts
16
- puts 'ERROR'
17
- puts e.message
18
- puts e.backtrace
19
- puts
26
+ stack_trace(e, record)
20
27
  binding.pry if CONFIG.debug
21
28
  end
22
29
  end
@@ -24,7 +31,76 @@ def marc_auth_count(marc_file)
24
31
  auth_records
25
32
  end
26
33
 
27
- def marc2ld(marc_filename)
34
# Memory intensive loading of all authority records in the MARC file.
#
# Reads marc_filename record by record, decodes every record whose
# leader type is 'z' and collects the decoded records in memory.
# A running six-digit count is rewritten in place on $stdout.
#
# marc_filename - path of the MARC file to read.
#
# Returns an Array of the decoded authority records. A record that
# raises while being parsed or decoded is reported and skipped.
def marc_authority_records(marc_filename)
  puts "Reading records from: #{marc_filename}"
  auth_count = 0
  auth_records = []
  # Block form guarantees the file is closed even if an exception
  # escapes the loop (e.g. from eof?).
  File.open(marc_filename,'r') do |marc_file|
    until marc_file.eof?
      # Reset per iteration so the rescue below never reports the
      # record decoded on a previous pass.
      record = nil
      begin
        leader = Marc2LinkedData::ParseMarcAuthority::parse_leader(marc_file)
        raw = marc_file.read(leader[:length])
        if leader[:type] == 'z'
          record = MARC::Reader.decode(raw)
          auth_records << record
          auth_count += 1
          # Backspace over the previous six-digit counter before redrawing.
          $stdout.printf "\b\b\b\b\b\b" if auth_count > 1
          $stdout.printf '%06d', auth_count
        end
      rescue => e
        stack_trace(e, record)
        binding.pry if CONFIG.debug
      end
    end
  end
  $stdout.write "\n"
  auth_records
end
60
+
61
+
62
+ def auth_record_cache(auth)
63
+
64
+ # auth_id = "auth:#{auth.get_id}"
65
+ # triples = nil
66
+ # # TODO: enable additional persistence options
67
+ # # Use data already in redis (if enabled)
68
+ # triples = CONFIG.redis.get(auth_id) if CONFIG.redis_read
69
+ # if triples.nil?
70
+ # triples = auth.to_ttl # generate new triples
71
+ # # Update redis (if enabled) for triples not read from redis
72
+ # CONFIG.redis.set(auth_id, triples) if CONFIG.redis_write
73
+ # end
74
+
75
+ end
76
+
77
+
78
+
79
# Translate one MARC authority record to turtle and write it to
# "auth_<id>.ttl" inside output_path.
#
# record      - the MARC record handed to ParseMarcAuthority.
# output_path - directory that receives the .ttl file. It must be
#               supplied: with the nil default File.join raises, and
#               that error is reported by the rescue below.
#
# Any error is reported through stack_trace and swallowed, so a single
# bad record does not stop the caller.
def marc_record2turtle(record, output_path=nil)
  begin
    # ParseMarcAuthority is a lazy parser, so
    # init only assigns record to an instance var.
    auth = Marc2LinkedData::ParseMarcAuthority.new(record)
    auth_record_cache(auth)
    triples = auth.to_ttl.lines
    binding.pry if (CONFIG.debug && triples.empty?)
    triples.delete_if {|l| l.chomp.empty? }
    # Output the triples to a turtle file.
    ld_filename = File.join(output_path, "auth_#{auth.get_id}.ttl")
    CONFIG.logger.info "Writing triples in turtle to #{ld_filename}"
    # Block form flushes and closes the handle even when write raises,
    # so a failed record cannot leak a file descriptor.
    File.open(ld_filename,'w') {|ld_file| ld_file.write(triples.join) }
  rescue => e
    stack_trace(e, record)
    binding.pry if CONFIG.debug
  end
end
100
+
101
+
102
+
103
+ def marc_file2ld(marc_filename)
28
104
  ld_filename = marc_filename.gsub('.mrc','.ttl')
29
105
  puts "Translating: #{marc_filename} to #{ld_filename}"
30
106
  ld_file = File.open(ld_filename,'w')
@@ -44,17 +120,7 @@ def marc2ld(marc_filename)
44
120
  # init only assigns record to an instance var.
45
121
  auth = Marc2LinkedData::ParseMarcAuthority.new(record)
46
122
  auth_count += 1
47
- # auth_id = "auth:#{auth.get_id}"
48
- # triples = nil
49
- # # TODO: enable additional persistence options
50
- # # Use data already in redis (if enabled)
51
- # triples = CONFIG.redis.get(auth_id) if CONFIG.redis_read
52
- # if triples.nil?
53
- # triples = auth.to_ttl # generate new triples
54
- # # Update redis (if enabled) for triples not read from redis
55
- # CONFIG.redis.set(auth_id, triples) if CONFIG.redis_write
56
- # end
57
-
123
+ auth_record_cache(auth)
58
124
  triples = auth.to_ttl.lines
59
125
  binding.pry if (CONFIG.debug && triples.empty?)
60
126
  triples.delete_if {|l| l.chomp.empty? }
@@ -63,12 +129,7 @@ def marc2ld(marc_filename)
63
129
  ld_file.flush
64
130
  end
65
131
  rescue => e
66
- puts
67
- puts 'ERROR'
68
- puts e.message
69
- puts e.backtrace
70
- puts record.to_s
71
- puts
132
+ stack_trace(e, record)
72
133
  binding.pry if CONFIG.debug
73
134
  end
74
135
  end
@@ -77,38 +138,68 @@ def marc2ld(marc_filename)
77
138
  ld_file.close
78
139
  end
79
140
 
141
# Block until every thread in the list has finished, then remove the
# finished threads from the list in place.
#
# threads - Array of Thread objects; it is mutated.
#
# Returns the same (pruned) Array.
def thread_wait(threads)
  threads.each(&:join)
  # A thread that is no longer alive has status false (normal exit) or
  # nil (terminated by an exception); one pass drops both kinds.
  threads.delete_if {|t| !t.alive? }
end
146
+
147
+
148
+ # ---------------------------------------------------------------------
149
+ # MAIN
150
+
151
+
80
152
  marc_files = []
81
153
  ARGV.each do |filename|
82
154
  path = Pathname(filename)
83
155
  marc_files.push(path) if path.exist?
84
156
  end
85
157
  if marc_files.empty?
158
+ script_name = File.basename(__FILE__)
159
+ script_path = File.dirname(__FILE__)
160
+ example_env_file = File.absolute_path(File.join(script_path,'..','.env_example'))
86
161
  puts <<HELP
87
- #{__FILE__} marc_authority_file1.mrc [ marc_authority_file2.mrc .. marc_authority_fileN.mrc ]
88
-
89
- Output is RDF triples in a turtle file (.ttl) for every input .mrc file.
90
- Optional persistence services can be controlled by environment variables.
91
-
92
- Redis Persistence - based on https://github.com/redis/redis-rb
93
- - essential options:
94
- export REDIS4MARC=true # enable redis persistence (default = false)
95
- - supplementary options:
96
- Set the REDIS_URL for a custom redis configuration.
97
- export REDIS_URL="redis://{user}:{password}@{host}:{port}/{db}"
98
- export REDIS_READ=true # enable redis reads (default = REDIS4MARC || false)
99
- # faster reading of triples from pre-populated redis data
100
- export REDIS_WRITE=true # enable redis writes (default = REDIS4MARC || false)
101
- # recent data is updated in redis
162
+ #{script_name} marc_authority_file1.mrc [ marc_authority_file2.mrc .. marc_authority_fileN.mrc ]
163
+
164
+ Output is RDF triples in turtle files (.ttl). The files are output into a
165
+ directory created in the same path as the .mrc file,
166
+ with one .ttl file for each record.
167
+
168
+ Optional configuration can be set in environment variables. A '.env' file can be
169
+ created in the path where this utility is run and this utility will use it. See
170
+ comments and settings in the example file at:
171
+ #{example_env_file}
102
172
 
103
173
  HELP
104
174
  exit!
105
- else
106
175
  end
107
176
 
108
177
  puts "Logging to: #{CONFIG.log_file}"
109
178
  marc_files.each do |path|
110
179
  CONFIG.logger.info "Processing: #{path}"
111
- marc2ld(path.to_s)
180
+ # marc_file2ld(path.to_s)
181
+ output_dir = path.basename.to_s.gsub('.mrc','').gsub('.','_') + '_turtle'
182
+ output_path = File.join(path.dirname.to_s, output_dir)
183
+ Dir.mkdir(output_path, 0775) unless File.directory? output_path
184
+ auth_records = marc_authority_records(path.to_s)
185
+ progress = ProgressBar.create(:total => auth_records.length, :format => '%a %f |%b>>%i| %P%% %t')
186
+ if CONFIG.threads
187
+ threads = []
188
+ auth_records.each do |r|
189
+ thread_wait(threads) while threads.length >= CONFIG.thread_limit
190
+ t = Thread.new { marc_record2turtle(r, output_path) }
191
+ t.abort_on_exception = true
192
+ threads << t
193
+ progress.increment # increment progress although thread is still running
194
+ end
195
+ # Ensure any remaining threads complete
196
+ threads.each{|t| t.join}
197
+ else
198
+ auth_records.each do |r|
199
+ marc_record2turtle(r, output_path)
200
+ progress.increment
201
+ end
202
+ end
112
203
  end
113
204
 
114
205
 
@@ -0,0 +1,68 @@
1
#!/bin/bash
#
# Smoke test: translate ./data/auth.mrc with marcAuthority2LD, write
# summary statistics about the generated turtle files into
# ./log/run_test_data.log, then remove the generated files.
#
# All paths are relative, so run this from the directory that contains
# .env_example, .binstubs/, data/ and log/.

set -u

export DEBUG=false

# Runs a lot slower on jruby, even with threading enabled.
export JRUBY_OPTS=-J-Xmx2g

# The setting is named THREADS in .env_example; exporting THREAD would
# leave the .env value in force.
export THREADS=false
export GET_LOC=false # if this is true, be prepared to wait a very long time!

export LOG_FILE='./log/marc2ld.log'
export LIB_PREFIX='http://www.linked-data.org/library/'

readonly SCRIPT_FILE='.binstubs/marcAuthority2LD'
readonly AUTH_FILE='./data/auth.mrc'
readonly AUTH_PATH='./data/auth_turtle/'
readonly DATA_LOG_FILE='./log/run_test_data.log'

# Print every line that mentions 'linked-data' in the generated files.
# find -exec ... {} + batches file names like xargs does, but is safe
# for names containing whitespace.
linked_data_lines() {
  find "${AUTH_PATH}" -type f -exec grep -- 'linked-data' {} +
}

# Append a heading and the number of 'linked-data' lines that contain
# the literal string $1 to the report.
log_count() {
  local label=$1
  printf "Count for '%s' authority records:\n" "${label}" >> "${DATA_LOG_FILE}"
  linked_data_lines | grep -c -F -- "${label}" >> "${DATA_LOG_FILE}"
}

# Both log files live in ./log; make sure it exists before writing.
mkdir -p ./log

# Additional config options should be in .env;
# the .env values will not replace those above.
if [ ! -s .env ]; then
  cp .env_example .env
fi

if [ ! -s "${SCRIPT_FILE}" ]; then
  echo "Cannot locate script: ${SCRIPT_FILE}" >&2
  exit 1
fi

if [ ! -s "${AUTH_FILE}" ]; then
  echo "Place a MARC21 authority file into: ${AUTH_FILE}" >&2
  exit 1
fi

"${SCRIPT_FILE}" "${AUTH_FILE}"

# Start a fresh report: truncate exactly once here. Everything below
# appends, so earlier sections are not wiped by later separators.
printf '\n\n\n' > "${DATA_LOG_FILE}"

{
  printf 'Output file count should be 100000:\n'
  find "${AUTH_PATH}" -type f | wc -l
  printf '\n\n\n'

  # count all the different types of authority files
  printf 'Different types of authority records:\n\n'
  linked_data_lines | sed -e 's/^.*> a/a/' | sort -u
  printf '\n\n\n'
} >> "${DATA_LOG_FILE}"

for label in 'Person' 'Organization' 'Place' 'event' 'v1#NameTitle' 'v1#Title'; do
  log_count "${label}"
done

# # check the syntax of the output files
# printf '\n\n\n\n' >> "${DATA_LOG_FILE}"
# find "${AUTH_PATH}" -type f -exec rapper -c -i turtle {} \; >> "${DATA_LOG_FILE}" 2>&1

# cleanup (":?" aborts instead of running rm on an empty path)
rm -rf -- "${AUTH_PATH:?}"
68
+