marc2linkeddata 0.0.7 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f54e31c819d0978629746c13c88690d5b2f1f392
4
- data.tar.gz: 5325910c217a5e1bd3d45bb0a1f0bdb3b48fa11e
3
+ metadata.gz: f9539a926fe1d42b3827b55c4e44245859f0c1e2
4
+ data.tar.gz: 048690d375535f681bf518af81639d6b6dca4af4
5
5
  SHA512:
6
- metadata.gz: 2565a8648875bc22f3b62ccd7cd1aad47ce843d388a93f6e4e363f7bab77644abbf56a1a70f73de9d6ac9aa9be3857578b62c87e5ebc2d4e50b157afaea7c7ae
7
- data.tar.gz: a8ba98f07795a0c97f4bd00d9c431ea7a4dd15d39952f1cdbc56a4136d4b6e60f2bb0a6d05cb6f3affe5d5eecd03dc6752b4093dcb8b149f61d36583911af610
6
+ metadata.gz: 7d42f2629e02882f94e60ceb1fe8503766d88c2cd108b0e57fb87d8704663f6a693667d098832d4c793e5e3ff285b04b3541d297d989529e63782dfbae5289a8
7
+ data.tar.gz: fa4fe996e40a4de2cb0bffab698dc76c843612ab59055c498b0a99af4f6d50a05156c0e715bcbfd074643e04ff0578687e555e3753eeb6990417f2f246ed11ab
data/.env_example CHANGED
@@ -1,46 +1,71 @@
1
1
  # https://github.com/bkeepers/dotenv is used for
2
2
  # default configuration options. The values in
3
3
  # this file do not replace existing values in
4
- # the shell ENV.
4
+ # the shell ENV, but these settings will be in
5
+ # effect when the shell ENV doesn't contain them.
6
+ # To add these settings to the shell ENV, use
7
+ # $ source .env
8
+ # To override individual settings in this file, just
9
+ # set them in the shell ENV before running a script.
5
10
 
6
11
  # Uncomment and set values as required. See used settings in
7
12
  # lib/marc2linkeddata/configuration.rb
8
13
 
9
- DEBUG: false
14
+ export DEBUG=false
15
+
16
+ export LOG_FILE='marc2ld.log'
17
+ export LIB_PREFIX=http://linked-data.example.org/library/
18
+
19
+ # Without any GET_* options enabled for HTTP retrieval, the process
20
+ # is largely file IO bound, rather than CPU bound. In this
21
+ # case, threading can decrease performance. However, when any
22
+ # GET_* options are enabled for retrieval of RDF over HTTP,
23
+ # threading may improve performance.
24
+ export THREADS=true
25
+ export THREAD_LIMIT=25
10
26
 
11
27
  # Authority record field numbers for useful link data
12
- FIELD_AUTH_LOC: 920
13
- FIELD_AUTH_VIAF: 921
14
- FIELD_AUTH_ISNI: 922
15
- FIELD_AUTH_OCLC: 035
28
+ export FIELD_AUTH_LOC=920
29
+ export FIELD_AUTH_VIAF=921
30
+ export FIELD_AUTH_ISNI=922
31
+ export FIELD_AUTH_OCLC=035
16
32
 
17
33
  # Options for retrieving linked data to resolve and enhance data.
18
- # Set all false for the quickest translation.
19
- GET_ISNI: false
20
- GET_LOC: true # currently required, should be optional
21
- GET_OCLC: false
22
- GET_VIAF: false
34
+ # Set all false for the quickest translation. Note that having an
35
+ # LOC identifier (the MARC record number may be sufficient) is usually
36
+ # a prelude to getting additional linked data. If any of these are
37
+ # enabled, it may require GET_LOC=true to have greater success.
38
+ export GET_LOC=false
39
+ export GET_ISNI=false
40
+ export GET_OCLC=false
41
+ export GET_VIAF=false
23
42
 
24
43
  # Using OCLC identity, retrieve RDF for creative works?
25
44
  # Only works when GET_OCLC==true; it can slow processing significantly.
26
- OCLC_AUTH2WORKS: false
27
-
28
- LOG_FILE: 'marc2ld.log'
29
-
30
- LIB_PREFIX: http://linked-data.example.org/library/
45
+ export OCLC_AUTH2WORKS=false
31
46
 
32
47
  # Use FOAF or SCHEMA or both
33
- USE_FOAF: false
34
- USE_SCHEMA: true
48
+ export USE_FOAF=false
49
+ export USE_SCHEMA=true
35
50
 
36
51
  # Local triple store for LOC authority data,
37
52
  # accessed via an HTTP API with basic authentication.
38
53
  # See downloads at http://id.loc.gov/download/
39
- LOCAL_LOC_USER: 'sparqlUser'
40
- LOCAL_LOC_PASS: 'sparqlPass'
41
- LOCAL_LOC_HOST: 'dev-sparql.example.org'
42
- LOCAL_LOC_PORT: '80'
43
- LOCAL_LOC_PATH: '/sparql?'
54
+ export LOCAL_LOC_USER='sparqlUser'
55
+ export LOCAL_LOC_PASS='sparqlPass'
56
+ export LOCAL_LOC_HOST='dev-sparql.example.org'
57
+ export LOCAL_LOC_PORT='80'
58
+ export LOCAL_LOC_PATH='/sparql?'
59
+
60
+ # SUL-CAP resources
61
+ # May require ssh port forwarding, e.g.:
62
+ # ssh ${USER}@cap-mysql-host.example.com -L 3308:localhost:3306 -N &
63
+ export SUL_CAP_ENABLED=false
64
+ export SUL_CAP_DB_HOST=localhost
65
+ export SUL_CAP_DB_PORT=3306
66
+ export SUL_CAP_DB_USER=capUser
67
+ export SUL_CAP_DB_PASSWORD=capPass
68
+ export SUL_CAP_DB_DATABASE=cap
44
69
 
45
70
  # Redis Persistence - based on https://github.com/redis/redis-rb
46
71
  # - essential options:
@@ -52,11 +77,11 @@ LOCAL_LOC_PATH: '/sparql?'
52
77
  # # faster reading of triples from pre-populated redis data
53
78
  # export REDIS_WRITE=true # enable redis writes (default = REDIS4MARC || false)
54
79
  # # current data is updated in redis
55
- REDIS4MARC: false
80
+ export REDIS4MARC=false
56
81
  # Uncomment these options to disable read or write (independently)
57
- #REDIS_READ: false
58
- #REDIS_WRITE: false
82
+ #export REDIS_READ=false
83
+ #export REDIS_WRITE=false
59
84
  # Leave commented to use default redis configs on localhost
60
- #REDIS_URL: localhost
85
+ #export REDIS_URL=localhost
61
86
 
62
87
 
data/README.md CHANGED
@@ -3,13 +3,44 @@ marc2linkeddata
3
3
  ===============
4
4
 
5
5
  Utilities for translating MARC21 into linked data. The project has
6
- focused on authority records (as of Feb, 2015).
6
+ focused on authority records (as of 2015).
7
+
8
+ It has config options that can be enabled to increase the amount of data retrieved.
9
+ Without any HTTP options enabled, using only data in the MARC record, it can
10
+ translate 100,000 authority records in about 5-6 min on a current laptop system.
11
+ File IO is the most expensive operation in this mode, so it helps to have a solid
12
+ state drive or something with high IO performance.
13
+
14
+ The current output is to the file system, but it should be easy to incorporate
15
+ and configure alternatives by using the RDF.rb facilities for connecting to a
16
+ repository. Using redis for caching was explored briefly, but that
17
+ exploration hasn't matured much, mainly because there is no 'cache-expiry' data
18
+ yet and because it would be better to use an RDF.rb extension of some
19
+ kind (for redis, mongodb, etc) or to use a triple store/solr platform.
20
+
21
+ With HTTP/RDF retrieval options enabled, it can take a lot longer (days) and the
22
+ providers may not be very happy about a barrage of requests.
23
+
24
+ Note that it runs a lot slower on jruby-9.0.0.0-pre1 than MRI 2.2.0, whether threads
25
+ are enabled or not. It raises exceptions on jruby-1.7.9, related to ruby
26
+ language support (such as Array#delete_if).
27
+
28
+ TODO: A significant problem to solve is effective caching or mirrors for linked data.
29
+ The retrieval should inspect any HTTP cache headers that might be available and
30
+ add PROVO to the linked-data graph generated for each record.
31
+
32
+ TODO: Provide system platform options, to dockerize the application and make it easier
33
+ for automatic horizontal scaling. Consider https://www.packer.io/intro/index.html
7
34
 
8
35
  Optional Dependencies
9
36
 
10
- - http://redis.io/
11
37
  - http://4store.org/
38
+ - http://www.mongodb.org/
39
+ - http://redis.io/
12
40
  - see notes below
41
+ - see also:
42
+ - http://marmotta.apache.org
43
+ - http://stardog.com
13
44
 
14
45
  Install
15
46
 
data/bin/marcAuthority2LD CHANGED
@@ -4,6 +4,17 @@ require 'marc2linkeddata'
4
4
 
5
5
  CONFIG = Marc2LinkedData.configuration
6
6
 
7
+ def stack_trace(e, record)
8
+ $stderr.write "\n"
9
+ $stderr.write "ERROR\n"
10
+ $stderr.write e.message
11
+ $stderr.write e.backtrace
12
+ $stderr.write "\n"
13
+ $stderr.write record.to_s
14
+ $stderr.write "\n"
15
+ end
16
+
17
+ # Count all the records in the MARC file.
7
18
  def marc_auth_count(marc_file)
8
19
  auth_records = 0
9
20
  until marc_file.eof?
@@ -12,11 +23,7 @@ def marc_auth_count(marc_file)
12
23
  marc_file.seek(leader[:length], IO::SEEK_CUR)
13
24
  auth_records += 1 if leader[:type] == 'z'
14
25
  rescue => e
15
- puts
16
- puts 'ERROR'
17
- puts e.message
18
- puts e.backtrace
19
- puts
26
+ stack_trace(e, record)
20
27
  binding.pry if CONFIG.debug
21
28
  end
22
29
  end
@@ -24,7 +31,76 @@ def marc_auth_count(marc_file)
24
31
  auth_records
25
32
  end
26
33
 
27
- def marc2ld(marc_filename)
34
+ # Memory intensive loading of all authority records in the MARC file.
35
+ def marc_authority_records(marc_filename)
36
+ puts "Reading records from: #{marc_filename}"
37
+ marc_file = File.open(marc_filename,'r')
38
+ auth_count = 0
39
+ auth_records = []
40
+ until marc_file.eof?
41
+ begin
42
+ leader = Marc2LinkedData::ParseMarcAuthority::parse_leader(marc_file)
43
+ raw = marc_file.read(leader[:length])
44
+ if leader[:type] == 'z'
45
+ record = MARC::Reader.decode(raw)
46
+ auth_records << record
47
+ auth_count += 1
48
+ $stdout.printf "\b\b\b\b\b\b" if auth_count > 1
49
+ $stdout.printf '%06d', auth_count
50
+ end
51
+ rescue => e
52
+ stack_trace(e, record)
53
+ binding.pry if CONFIG.debug
54
+ end
55
+ end
56
+ marc_file.close
57
+ $stdout.write "\n"
58
+ auth_records
59
+ end
60
+
61
+
62
+ def auth_record_cache(auth)
63
+
64
+ # auth_id = "auth:#{auth.get_id}"
65
+ # triples = nil
66
+ # # TODO: enable additional persistence options
67
+ # # Use data already in redis (if enabled)
68
+ # triples = CONFIG.redis.get(auth_id) if CONFIG.redis_read
69
+ # if triples.nil?
70
+ # triples = auth.to_ttl # generate new triples
71
+ # # Update redis (if enabled) for triples not read from redis
72
+ # CONFIG.redis.set(auth_id, triples) if CONFIG.redis_write
73
+ # end
74
+
75
+ end
76
+
77
+
78
+
79
+ def marc_record2turtle(record, output_path=nil)
80
+ begin
81
+ # ParseMarcAuthority is a lazy parser, so
82
+ # init only assigns record to an instance var.
83
+ auth = Marc2LinkedData::ParseMarcAuthority.new(record)
84
+ auth_record_cache(auth)
85
+ triples = auth.to_ttl.lines
86
+ binding.pry if (CONFIG.debug && triples.empty?)
87
+ triples.delete_if {|l| l.chomp.empty? }
88
+ # Output the triples to a turtle file.
89
+ ld_filename = File.join(output_path, "auth_#{auth.get_id}.ttl")
90
+ CONFIG.logger.info "Writing triples in turtle to #{ld_filename}"
91
+ ld_file = File.open(ld_filename,'w')
92
+ ld_file.write(triples.join)
93
+ ld_file.flush
94
+ ld_file.close
95
+ rescue => e
96
+ stack_trace(e, record)
97
+ binding.pry if CONFIG.debug
98
+ end
99
+ end
100
+
101
+
102
+
103
+ def marc_file2ld(marc_filename)
28
104
  ld_filename = marc_filename.gsub('.mrc','.ttl')
29
105
  puts "Translating: #{marc_filename} to #{ld_filename}"
30
106
  ld_file = File.open(ld_filename,'w')
@@ -44,17 +120,7 @@ def marc2ld(marc_filename)
44
120
  # init only assigns record to an instance var.
45
121
  auth = Marc2LinkedData::ParseMarcAuthority.new(record)
46
122
  auth_count += 1
47
- # auth_id = "auth:#{auth.get_id}"
48
- # triples = nil
49
- # # TODO: enable additional persistence options
50
- # # Use data already in redis (if enabled)
51
- # triples = CONFIG.redis.get(auth_id) if CONFIG.redis_read
52
- # if triples.nil?
53
- # triples = auth.to_ttl # generate new triples
54
- # # Update redis (if enabled) for triples not read from redis
55
- # CONFIG.redis.set(auth_id, triples) if CONFIG.redis_write
56
- # end
57
-
123
+ auth_record_cache(auth)
58
124
  triples = auth.to_ttl.lines
59
125
  binding.pry if (CONFIG.debug && triples.empty?)
60
126
  triples.delete_if {|l| l.chomp.empty? }
@@ -63,12 +129,7 @@ def marc2ld(marc_filename)
63
129
  ld_file.flush
64
130
  end
65
131
  rescue => e
66
- puts
67
- puts 'ERROR'
68
- puts e.message
69
- puts e.backtrace
70
- puts record.to_s
71
- puts
132
+ stack_trace(e, record)
72
133
  binding.pry if CONFIG.debug
73
134
  end
74
135
  end
@@ -77,38 +138,68 @@ def marc2ld(marc_filename)
77
138
  ld_file.close
78
139
  end
79
140
 
141
+ def thread_wait(threads)
142
+ threads.each{|t| t.join}
143
+ threads.delete_if {|t| t.status == false}
144
+ threads.delete_if {|t| t.status.nil? }
145
+ end
146
+
147
+
148
+ # ---------------------------------------------------------------------
149
+ # MAIN
150
+
151
+
80
152
  marc_files = []
81
153
  ARGV.each do |filename|
82
154
  path = Pathname(filename)
83
155
  marc_files.push(path) if path.exist?
84
156
  end
85
157
  if marc_files.empty?
158
+ script_name = File.basename(__FILE__)
159
+ script_path = File.dirname(__FILE__)
160
+ example_env_file = File.absolute_path(File.join(script_path,'..','.env_example'))
86
161
  puts <<HELP
87
- #{__FILE__} marc_authority_file1.mrc [ marc_authority_file2.mrc .. marc_authority_fileN.mrc ]
88
-
89
- Output is RDF triples in a turtle file (.ttl) for every input .mrc file.
90
- Optional persistence services can be controlled by environment variables.
91
-
92
- Redis Persistence - based on https://github.com/redis/redis-rb
93
- - essential options:
94
- export REDIS4MARC=true # enable redis persistence (default = false)
95
- - supplementary options:
96
- Set the REDIS_URL for a custom redis configuration.
97
- export REDIS_URL="redis://{user}:{password}@{host}:{port}/{db}"
98
- export REDIS_READ=true # enable redis reads (default = REDIS4MARC || false)
99
- # faster reading of triples from pre-populated redis data
100
- export REDIS_WRITE=true # enable redis writes (default = REDIS4MARC || false)
101
- # recent data is updated in redis
162
+ #{script_name} marc_authority_file1.mrc [ marc_authority_file2.mrc .. marc_authority_fileN.mrc ]
163
+
164
+ Output is RDF triples in turtle files (.ttl). The files are output into a
165
+ directory created in the same path as the .mrc file,
166
+ with one .ttl file for each record.
167
+
168
+ Optional configuration can be set in environment variables. A '.env' file can be
169
+ created in the path where this utility is run and this utility will use it. See
170
+ comments and settings in the example file at:
171
+ #{example_env_file}
102
172
 
103
173
  HELP
104
174
  exit!
105
- else
106
175
  end
107
176
 
108
177
  puts "Logging to: #{CONFIG.log_file}"
109
178
  marc_files.each do |path|
110
179
  CONFIG.logger.info "Processing: #{path}"
111
- marc2ld(path.to_s)
180
+ # marc_file2ld(path.to_s)
181
+ output_dir = path.basename.to_s.gsub('.mrc','').gsub('.','_') + '_turtle'
182
+ output_path = File.join(path.dirname.to_s, output_dir)
183
+ Dir.mkdir(output_path, 0775) unless File.directory? output_path
184
+ auth_records = marc_authority_records(path.to_s)
185
+ progress = ProgressBar.create(:total => auth_records.length, :format => '%a %f |%b>>%i| %P%% %t')
186
+ if CONFIG.threads
187
+ threads = []
188
+ auth_records.each do |r|
189
+ thread_wait(threads) while threads.length >= CONFIG.thread_limit
190
+ t = Thread.new { marc_record2turtle(r, output_path) }
191
+ t.abort_on_exception = true
192
+ threads << t
193
+ progress.increment # increment progress although thread is still running
194
+ end
195
+ # Ensure any remaining threads complete
196
+ threads.each{|t| t.join}
197
+ else
198
+ auth_records.each do |r|
199
+ marc_record2turtle(r, output_path)
200
+ progress.increment
201
+ end
202
+ end
112
203
  end
113
204
 
114
205
 
@@ -0,0 +1,68 @@
1
+ #!/bin/bash
2
+
3
+ export DEBUG=false
4
+
5
+ # Runs a lot slower on jruby, even with threading enabled.
6
+ export JRUBY_OPTS=-J-Xmx2g
7
+
8
+ export THREAD=false
9
+ export GET_LOC=false # if this is true, be prepared to wait a very long time!
10
+
11
+ export LOG_FILE='./log/marc2ld.log'
12
+ export LIB_PREFIX='http://www.linked-data.org/library/'
13
+
14
+ # Additional config options should be in .env;
15
+ # the .env values will not replace those above.
16
+ if [ ! -s .env ]; then
17
+ cp -u .env_example .env
18
+ fi
19
+
20
+ SCRIPT_FILE='.binstubs/marcAuthority2LD'
21
+ if [ ! -s ${SCRIPT_FILE} ]; then
22
+ echo "Cannot locate script: $SCRIPT_FILE"
23
+ exit 1
24
+ fi
25
+
26
+ AUTH_FILE="./data/auth.mrc"
27
+ AUTH_PATH="./data/auth_turtle/"
28
+ if [ ! -s ${AUTH_FILE} ]; then
29
+ echo "Place a MARC21 authority file into: $AUTH_FILE"
30
+ exit 1
31
+ fi
32
+
33
+ ${SCRIPT_FILE} ${AUTH_FILE}
34
+
35
+ DATA_LOG_FILE="./log/run_test_data.log"
36
+ echo -e "\n\n" > ${DATA_LOG_FILE}
37
+
38
+ echo -e "Output file count should be 100000:" >> ${DATA_LOG_FILE}
39
+ find ${AUTH_PATH} -type f | wc -l >> ${DATA_LOG_FILE}
40
+ echo -e "\n\n" > ${DATA_LOG_FILE}
41
+
42
+ # count all the different types of authority files
43
+ echo -e "Different types of authority records:\n" >> ${DATA_LOG_FILE}
44
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | sed -e 's/^.*> a/a/' | sort -u >> ${DATA_LOG_FILE}
45
+ echo -e "\n\n" > ${DATA_LOG_FILE}
46
+
47
+ echo -e "Count for 'Person' authority records:" >> ${DATA_LOG_FILE}
48
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | grep -c -F 'Person' >> ${DATA_LOG_FILE}
49
+ echo -e "Count for 'Organization' authority records:" >> ${DATA_LOG_FILE}
50
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | grep -c -F 'Organization' >> ${DATA_LOG_FILE}
51
+ echo -e "Count for 'Place' authority records:" >> ${DATA_LOG_FILE}
52
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | grep -c -F 'Place' >> ${DATA_LOG_FILE}
53
+ echo -e "Count for 'event' authority records:" >> ${DATA_LOG_FILE}
54
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | grep -c -F 'event' >> ${DATA_LOG_FILE}
55
+ echo -e "Count for 'v1#NameTitle' authority records:" >> ${DATA_LOG_FILE}
56
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | grep -c -F 'v1#NameTitle' >> ${DATA_LOG_FILE}
57
+ echo -e "Count for 'v1#Title' authority records:" >> ${DATA_LOG_FILE}
58
+ find ${AUTH_PATH} -type f | xargs grep 'linked-data' | grep -c -F 'v1#Title' >> ${DATA_LOG_FILE}
59
+
60
+ # # check the syntax of the output files
61
+ # echo -e "\n\n\n" >> ${DATA_LOG_FILE}
62
+ # for f in $(find ${AUTH_PATH} -type f); do
63
+ # rapper -c -i turtle $f >> ${DATA_LOG_FILE} 2>&1
64
+ # done
65
+
66
+ # cleanup
67
+ rm -rf ${AUTH_PATH}
68
+