traject_horizon 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +173 -0
- data/Rakefile +11 -0
- data/lib/traject/horizon_bib_auth_merge.rb +124 -0
- data/lib/traject/horizon_reader.rb +641 -0
- data/lib/traject_horizon.rb +6 -0
- data/lib/traject_horizon/version.rb +3 -0
- data/test/horizon_bib_auth_merge_test.rb +58 -0
- data/test/test_helper.rb +16 -0
- data/traject_horizon.gemspec +24 -0
- data/vendor/jtds/.DS_Store +0 -0
- data/vendor/jtds/jtds-1.2.8.jar +0 -0
- metadata +110 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jonathan Rochkind
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
# Traject::Horizon
|
2
|
+
|
3
|
+
Export MARC records directly from a Horizon ILS rdbms, either as serialized MARC,
|
4
|
+
or to then index to Solr.
|
5
|
+
|
6
|
+
traject-horizon is a plugin for [traject](http://github.com/jrochkind/traject), and
|
7
|
+
requires jruby to be installed.
|
8
|
+
|
9
|
+
Supports embedding copy/item holdings information in exported MARC.
|
10
|
+
|
11
|
+
Fairly high-performance, should have higher throughput than most existing
|
12
|
+
Horizon MARC export options, including the vendor-supplied Windows-only
|
13
|
+
'marcout'. There are probably opportunities for increasing performance
|
14
|
+
yet further with more development of multi-threaded processing.
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
traject_horizon is a plugin for [traject](http://github.com/jrochkind/traject), install
|
19
|
+
them both:
|
20
|
+
|
21
|
+
$ gem install traject traject_horizon
|
22
|
+
|
23
|
+
### Or, if using a Gemfile with your traject project
|
24
|
+
|
25
|
+
Add this line to your [traject project's Gemfile](https://github.com/jrochkind/traject/blob/master/doc/extending.md#or-with-bundler):
|
26
|
+
|
27
|
+
gem 'traject_horizon'
|
28
|
+
|
29
|
+
And then execute:
|
30
|
+
|
31
|
+
$ bundle install
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
I recommend creating a separate traject configuration file just with
|
36
|
+
settings for the Horizon export.
|
37
|
+
|
38
|
+
~~~ruby
|
39
|
+
# horizon_conf.rb
|
40
|
+
|
41
|
+
# Require traject_horizon to load the gem, including
|
42
|
+
# the Traject::HorizonReader we'll subsequently
|
43
|
+
# configure to be used
|
44
|
+
require 'traject_horizon'
|
45
|
+
|
46
|
+
settings do
|
47
|
+
store "reader_class_name", "Traject::HorizonReader"
|
48
|
+
|
49
|
+
# JDBC URL starting with "jdbc:jtds", and either "sybase:"
|
50
|
+
# or "sqlserver:", including username on the end but not password:
|
51
|
+
provide "horizon.jdbc_url", "jdbc:jtds:sybase://horizonserver.university.edu:2025/horizon_db;user=esys"
|
52
|
+
|
53
|
+
# DB password in separate setting
|
54
|
+
provide "horizon.jdbc_password", "drilg53"
|
55
|
+
|
56
|
+
# Do you want to include copy/item holdings information?
|
57
|
+
# this setting says to include "top-level" holdings,
|
58
|
+
# copy or item but not both. Holdings will be included
|
59
|
+
# in tags 991 and 937, although the tags and nature
|
60
|
+
# of included holdings is configurable.
|
61
|
+
provide "horizon.include_holdings", "direct"
|
62
|
+
|
63
|
+
# Would you like to exclude certain tags from
|
64
|
+
# your Horizon db? If you are including holdings,
|
65
|
+
# then it's recommended to exclude 991 and 937 to
|
66
|
+
# avoid any collision with the tags we add to represent holdings.
|
67
|
+
provide "horizon.exclude_tags", "991,937"
|
68
|
+
end
|
69
|
+
~~~
|
70
|
+
|
71
|
+
There are a variety of additional settings that apply to the HorizonReader,
|
72
|
+
especially settings for customizing the item/copy holdings information
|
73
|
+
included. See [HorizonReader] inline comment docs.
|
74
|
+
|
75
|
+
Note by default `staff-only` records are _not_ included in the export,
|
76
|
+
but this can be changed in settings.
|
77
|
+
|
78
|
+
As with all traject settings, string-valued settings can also be supplied
|
79
|
+
on the traject command line with `-s setting=value`.
|
80
|
+
|
81
|
+
### Export MARC records
|
82
|
+
|
83
|
+
$ traject -x marcout -c horizon_conf.rb -o marc_files.marc
|
84
|
+
|
85
|
+
That will export your entire horizon database,
|
86
|
+
using the connection details and configuration from horizon_conf.rb, exporting
|
87
|
+
in ISO 2709 binary format to `marc_files.marc`.
|
88
|
+
|
89
|
+
You can also specify specific ranges of bib#'s to export:
|
90
|
+
|
91
|
+
$ traject -x marcout -c horizon_conf.rb -o marc_files.marc -s horizon.first_bib=10000 -s horizon.last_bib=10100
|
92
|
+
$ traject -x marcout -c horizon_conf.rb -o marc_files.marc -s horizon.only_bib=12345
|
93
|
+
|
94
|
+
You can export in MarcXML, or in a human readable format for debugging,
|
95
|
+
using standard traject `-x marcout` functionality:
|
96
|
+
|
97
|
+
$ traject -x marcout -c horizon_conf.rb -s marcout.type=xml -o marc_files.xml
|
98
|
+
|
99
|
+
# leave off the `-o` argument to write to stdout, and view bib# 12345 in
|
100
|
+
# human-readable format:
|
101
|
+
$ traject -x marcout -c horizon_conf.rb -s marcout.type=human -s horizon.only_bib=12345
|
102
|
+
|
103
|
+
### Indexing records to solr
|
104
|
+
|
105
|
+
Traject is primarily a tool for indexing to solr. You can use `traject-horizon` to
|
106
|
+
export from Horizon and send directly through the indexing pipeline, without
|
107
|
+
having to serialize MARC to disk first.
|
108
|
+
|
109
|
+
You would have one or more additional traject configuration files specifying
|
110
|
+
your indexing mapping rules, and Solr connection details. See traject
|
111
|
+
documentation.
|
112
|
+
|
113
|
+
Then, simply:
|
114
|
+
|
115
|
+
$ traject -c horizon_conf.rb -c other_traject_conf.rb
|
116
|
+
|
117
|
+
## Note on character encodings
|
118
|
+
|
119
|
+
By default, traject-horizon assumes the data in your Horizon database is stored
|
120
|
+
in the Marc8 encoding. (I think this is true of all Horizon databases?). And by
|
121
|
+
default, traject-horizon will transcode it to UTF-8, marking leader byte 9 in any
|
122
|
+
exported MARC appropriately (Using the Marc4J AnselConverter class).
|
123
|
+
|
124
|
+
If you'd like traject to avoid this transcode, you can set the traject
|
125
|
+
setting `horizon.destination_encoding` to nil or the empty string, either
|
126
|
+
on the command line:
|
127
|
+
|
128
|
+
traject -x marcout -s horizon.destination_encoding= -c horizon_conf.rb
|
129
|
+
|
130
|
+
Or in your traject configuration file:
|
131
|
+
|
132
|
+
settings do
|
133
|
+
#...
|
134
|
+
provide "horizon.destination_encoding", nil
|
135
|
+
end
|
136
|
+
|
137
|
+
You might want to do this with `marcout` use, perhaps for diagnostics, but
|
138
|
+
it shouldn't ever be appropriate for indexing-to-solr use, as there are limited
|
139
|
+
facilities for dealing with Marc8 encoding in ruby.
|
140
|
+
|
141
|
+
Currently, item/copy information may not be treated entirely consistently here;
|
142
|
+
there may be edge-case encoding bugs related to non-ascii item/copy notes etc,
|
143
|
+
and it may not be possible to output them in Marc8. Sorry.
|
144
|
+
|
145
|
+
## Challenges
|
146
|
+
|
147
|
+
I had to reverse engineer the Horizon database to figure out how to turn it into
|
148
|
+
MARC records. I believe I have been successful, and traject-horizon seems to produce
|
149
|
+
the same output as Horizon's own marcout.
|
150
|
+
|
151
|
+
Hopefully this will remain true in future Horizon versions, I don't think relevant
|
152
|
+
aspects of Horizon architecture change very much, but it's always a risk.
|
153
|
+
|
154
|
+
The two biggest challenges were dealing with character encoding, and dealing
|
155
|
+
with merging information from the Horizon bib and auth tables.
|
156
|
+
|
157
|
+
The translation from Marc8 to UTF8 appears to work properly, _except_
|
158
|
+
some known issues with item/copy holding information. item/copy holding
|
159
|
+
information may occasionally not transcode properly, and it may not
|
160
|
+
be possible to keep item/copy holding info in Marc8. If these become
|
161
|
+
an actual problem in practice for anyone, further development can
|
162
|
+
probably resolve these issues.
|
163
|
+
|
164
|
+
|
165
|
+
## Development
|
166
|
+
|
167
|
+
There is only limited test coverage at the moment, sorry. I couldn't
|
168
|
+
quite figure out how to easily provide test coverage when so much
|
169
|
+
functionality interacts with a Horizon database.
|
170
|
+
|
171
|
+
There is some test coverage of the bib/auth merging routines.
|
172
|
+
|
173
|
+
Tests are provided with minitest, and can be run with `rake test`.
|
data/Rakefile
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
module Traject

  # Merges 'bib text' and 'auth text' lines from Horizon, using bib text as
  # template when necessary.
  #
  #     merged_str = HorizonBibAuthMerge.new(tag, bib_text_str, auth_text_str).merge!
  #
  # Strings passed in may be mutated for efficiency, so you can only call
  # merge! once per instance; this is just a single-use utility object.
  class HorizonBibAuthMerge
    attr_reader :bibtext, :authtext, :tag

    # Pass in bibtext and authtext as String -- you probably need to get
    # column values from JDBC as bytes and then use String.from_java_bytes
    # to avoid messing up possible Marc8 encoding.
    #
    # bibtext is either text or longtext column from fullbib, preferring
    # longtext. authtext is either xref_text or xref_longtext from fullbib,
    # preferring xref_longtext. Either may be nil.
    #
    # tag is the MARC tag as a String (eg "240"); it only matters for the
    # 240/243 special case in merge!.
    def initialize(tag, bibtext, authtext)
      @merged = false

      @tag      = tag
      @bibtext  = bibtext
      @authtext = authtext

      # remove terminal MARC Field Terminator if present (mutates the
      # caller's strings in place).
      @bibtext.chomp!("\x1E") if @bibtext
      @authtext.chomp!("\x1E") if @authtext
    end

    # Returns merged string, composed of a marc 'field', with subfields
    # separated by separator control chars (\x1F). Does not include terminal
    # MARC Field Terminator. Returns nil if both bibtext and authtext are nil.
    #
    # Will mutate bibtext and authtext for efficiency.
    #
    # Raises RuntimeError if called more than once on the same instance.
    def merge!
      raise RuntimeError, "Can only call `merge!` once, already called." if @merged
      @merged = true

      # just one? (Or neither?) Just return it.
      return authtext if bibtext.nil?
      return bibtext if authtext.nil?

      # We need to do a crazy combination of template in text with values in authtext.
      # horizon, you so crazy. text template is like:
      #   "\x1Fa.\x1Fp ;\x1Fv81."
      # which means each subfield after the \x1F, merge in
      # the subfield value from the auth record if it's present,
      # otherwise don't.
      #
      # plus some weird stuff with punctuation and spaces, arrived at by
      # trial and error comparing to marcout.
      bibtext.gsub!(/\x1F([^\x1F\x1E])( ?)([[:punct:] ]*)/) do
        # Capture the match groups right away: the nested sub! below
        # overwrites $1..$3.
        subfield    = $1
        space       = $2
        maybe_punct = $3

        # reverse engineered: for 240 and 243, 'a' in template
        # is filled by 't' in auth tag.
        auth_subfield = if subfield == "a" && (tag == "240" || tag == "243")
          "t"
        else
          subfield
        end

        # Find substitute fill-in value from authtext, if it can
        # be found -- first subfield indicated. Then we REMOVE
        # it from authtext, so next time this subfield is asked for,
        # subsequent subfield with that code will be used.
        #
        # The subfield code is escaped: it comes straight from data and may
        # be a regex metacharacter (".", "(", "*", ...), which would otherwise
        # match the wrong subfield or raise RegexpError.
        substitute = nil
        authtext.sub!(/\x1F#{Regexp.escape(auth_subfield)}([^\x1F\x1E]*)/) do
          substitute = $1
          ''
        end

        if substitute
          # Dealing with punctuation is REALLY CONFUSING -- reverse engineering
          # HIP/Horizon. But we seem to have arrived at something that appears
          # to match all cases we can find of what HIP/Horizon does.
          #
          # If the auth value already ends with the same punctuation as the template,
          # _leave it alone_ -- including preserving all spaces near the punct in
          # the auth value.
          #
          # Otherwise, remove all trailing punct from the auth value, then add in
          # the punct from the template, along with any space before the punct
          # in the template.
          if maybe_punct && !maybe_punct.empty?
            unless substitute.end_with? maybe_punct
              substitute.gsub!(/[[:punct:]]+\Z/, "")
              # Adding the #{space} back in is consistent with what HIP does.
              # No idea if it's right or a bug in HIP, but being consistent.
              # Neither leaving it in nor taking it out is exactly consistent
              # with HznExportMarc, which seems to have bugs.
              substitute << "#{space}#{maybe_punct}"
            end
          end

          "\x1F#{subfield}#{substitute}"
        else
          # nothing to fill in from auth: keep just the subfield marker,
          # dropping the template's space/punct.
          "\x1F#{subfield}"
        end
      end

      # We mutated bibtext to fill in template, now just return it.
      bibtext
    end

  end
end
|
@@ -0,0 +1,641 @@
|
|
1
|
+
require 'traject'
|
2
|
+
require 'traject/util'
|
3
|
+
require 'traject/indexer/settings'
|
4
|
+
|
5
|
+
require 'traject/horizon_bib_auth_merge'
|
6
|
+
|
7
|
+
require 'marc'
|
8
|
+
|
9
|
+
module Traject
|
10
|
+
#
|
11
|
+
# = Settings
|
12
|
+
#
|
13
|
+
# == Connection
|
14
|
+
#
|
15
|
+
# [horizon.jdbc_url] JDBC connection URL using jtds. Should include username, but not password.
|
16
|
+
# See `horizon.jdbc_password` setting, kept separate so we can try to suppress
|
17
|
+
# it from logging. Eg: "jdbc:jtds:sybase://horizon.lib.univ.edu:2025/dbname;user=dbuser"
|
18
|
+
# * In command line, you'll have to use quotes: -s 'horizon.jdbc_url=jdbc:jtds:sybase://horizon.lib.univ.edu:2025/dbname;user=dbuser'
|
19
|
+
#
|
20
|
+
# [horizon.jdbc_password] Password to use for JDBC connection. We'll try to suppress it from being logged.
|
21
|
+
#
|
22
|
+
# == What to export
|
23
|
+
#
|
24
|
+
# Normally exports the entire horizon database, for diagnostic or batch purposes you
|
25
|
+
# can export just one bib, or a range of bibs instead.
|
26
|
+
#
|
27
|
+
# [horizon.first_bib] Greater than or equal to this bib number. Can be combined with horizon.last_bib
|
28
|
+
# [horizon.last_bib] Less than or equal to this bib number. Can be combined with horizon.first_bib
|
29
|
+
# [horizon.only_bib] Only this single bib number.
|
30
|
+
#
|
31
|
+
# You can also control whether to export staff-only bibs, copies, and items.
|
32
|
+
#
|
33
|
+
# [horizon.public_only] Default true. If set to true, only includes bibs that are NOT staff_only,
|
34
|
+
# also only include copy/item that are not staff-only if including copy/item.
|
35
|
+
#
|
36
|
+
# You can also exclude certain tags:
|
37
|
+
#
|
38
|
+
# [horizon.exclude_tags] Default nil. A comma-separated string (so easy to supply on command line)
|
39
|
+
# of tag names to exclude from export. You probably want to at least include the tags
|
40
|
+
# you are using for horizon.item_tag and horizon.copy_tag, to avoid collision
|
41
|
+
# from tags already in record.
|
42
|
+
#
|
43
|
+
# == Item/Copy Inclusion
|
44
|
+
#
|
45
|
+
# The HorizonReader can export MARC with holdings information (horizon items and copies) included
|
46
|
+
# in the MARC. Each item or copy record will be represented as one marc field -- the tags
|
47
|
+
# used are configurable. You can configure how individual columns from item or copy tables
|
48
|
+
# map to MARC subfields in that field -- and also include columns from other tables joined
|
49
|
+
# to item or copy.
|
50
|
+
#
|
51
|
+
# [horizon.include_holdings] * false, nil, or empty string: Do not include holdings. (DEFAULT)
|
52
|
+
# * all: include copies and items
|
53
|
+
# * items: only include items
|
54
|
+
# * copies: only include copies
|
55
|
+
# * direct: only include copies OR items, but not both; if bib has copies,
|
56
|
+
# include copies, otherwise include items if present.
|
57
|
+
#
|
58
|
+
# Each item or copy will be one marc field, you can configure what tags these fields
|
59
|
+
# will have.
|
60
|
+
#
|
61
|
+
# [horizon.item_tag] Default "991".
|
62
|
+
# [horizon.copy_tag] Default "937"
|
63
|
+
#
|
64
|
+
# Which columns from item or copy tables will be mapped to which subfields in those
|
65
|
+
# fields is controlled by hashes in settings, hash from column name (with table prefix)
|
66
|
+
# to subfield code. There are defaults, see HorizonReader.default_settings. Example for
|
67
|
+
# item_map default:
|
68
|
+
#
|
69
|
+
# "horizon.item_map" => {
|
70
|
+
# "item.call_reconstructed" => "a",
|
71
|
+
# "collection.call_type" => "b",
|
72
|
+
# "item.copy_reconstructed" => "c",
|
73
|
+
# "call_type.processor" => "f",
|
74
|
+
# "item.item#" => "i",
|
75
|
+
# "item.collection" => "l",
|
76
|
+
# "item.location" => "m",
|
77
|
+
# "item.notes" => "n",
|
78
|
+
# "item.staff_only" => "q"
|
79
|
+
# }
|
80
|
+
#
|
81
|
+
# [horizon.item_map]
|
82
|
+
# [horizon.copy_map]
|
83
|
+
#
|
84
|
+
# The column-to-subfield maps can include columns from other tables
|
85
|
+
# joined in, with a join clause configured in settings too.
|
86
|
+
# By default both item and copy join to: collection, and call_type --
|
87
|
+
# using some clever SQL to join to call_type on the item/copy fk, OR the
|
88
|
+
# associated collection fk if no specific item/copy one is defined.
|
89
|
+
#
|
90
|
+
# [horizon.item_join_clause]
|
91
|
+
# [horizon.copy_join_clause]
|
92
|
+
#
|
93
|
+
# == Character Encoding
|
94
|
+
#
|
95
|
+
# The HorizonReader can convert from Marc8 to UTF8. By default `horizon.source_encoding` is set to "MARC8"
|
96
|
+
# and `horizon.destination_encoding` is set to "UTF8", which will make it do that conversion, as well
|
97
|
+
# as set the leader byte for char encoding properly.
|
98
|
+
#
|
99
|
+
# Any other configuration of those settings, and no transcoding will take place, HorizonReader
|
100
|
+
# is not currently capable of doing any other transcoding. Set `horizon.source_encoding`
|
101
|
+
# or `horizon.destination_encoding` to nil if you don't want any transcoding to happen --
|
102
|
+
# you'd only want this for diagnostic purposes, or if your horizon db is already utf8 (is
|
103
|
+
# that possible? We don't know.)
|
104
|
+
#
|
105
|
+
# [horizon.codepoint_translate] translates from Horizon's weird <U+nnnn> unicode
|
106
|
+
# codepoint escaping to actual UTF-8 bytes. Defaults to true. Will be ignored
|
107
|
+
# unless horizon.destination_encoding is UTF8 though.
|
108
|
+
#
|
109
|
+
# == Misc
|
110
|
+
#
|
111
|
+
# [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
|
112
|
+
# [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
|
113
|
+
# subsidiary fetch. See description of this setting in docs/settings.md
|
114
|
+
#
|
115
|
+
# [jtds.jar_path] Normally we'll use a distribution of jtds bundled with this gem.
|
116
|
+
# But specify a filepath of a directory containing jtds jar(s),
|
117
|
+
# and all jars in that dir will be loaded instead of our bundled jtds.
|
118
|
+
#
|
119
|
+
#
|
120
|
+
# Note: Could probably make this even faster by using a thread pool -- the bottleneck
|
121
|
+
# is probably processing into MARC, not the database query and streaming. But it's a
|
122
|
+
# bit tricky to refactor for concurrency there. Perhaps pull all the raw
|
123
|
+
# row values out and batch them in groups by bib#, then feed those lists
|
124
|
+
# to a threadpool. And then we'd just be fighting for CPU time with the
|
125
|
+
# threadpool for mapping, not sure if overall throughput increase would happen, would
|
126
|
+
# depend on exact environment.
|
127
|
+
class HorizonReader
|
128
|
+
attr_reader :settings
|
129
|
+
attr_reader :things_to_close
|
130
|
+
|
131
|
+
# We ignore the iostream even though we get one, we're gonna
|
132
|
+
# read from a Horizon DB!
|
133
|
+
def initialize(iostream, settings)
  # iostream is accepted but never used: records come from the Horizon db.
  # Caller-supplied settings are layered over this class's defaults.
  defaults  = Traject::Indexer::Settings.new(self.class.default_settings)
  @settings = defaults.merge(settings)

  # Load the Java dependencies up front.
  require_jars!
end
|
140
|
+
|
141
|
+
# Requires marc4j and jtds, and java_import's some classes.
|
142
|
+
def require_jars!
  Traject::Util.jruby_ensure_init!("Traject::HorizonReader")
  Traject::Util.require_marc4j_jars(settings)

  # Referring to the class by fully qualified name does not seem to work;
  # java_import with a String does.
  java_import "org.marc4j.converter.impl.AnselToUnicode"

  unless defined? Java::net.sourceforge.jtds.jdbc.Driver
    # Use the directory from settings if given, else the jtds bundled with the gem.
    bundled_dir = File.expand_path("../../vendor/jtds", File.dirname(__FILE__))
    jar_dir     = settings["jtds.jar_path"] || bundled_dir

    Dir.glob("#{jar_dir}/*.jar").each { |jar| require jar }

    # Plain Java would call Class.forName("net.sourceforge.jtds.jdbc.Driver")
    # so JDBC recognizes the driver. That doesn't work in JRuby, but
    # referencing the class like this seems to do the same thing.
    Java::net.sourceforge.jtds.jdbc.Driver
  end

  # So we can refer to these classes as just ResultSet, etc.
  java_import java.sql.ResultSet, java.sql.PreparedStatement, java.sql.Driver
end
|
171
|
+
|
172
|
+
def fetch_result_set!(conn)
  # Builds the bib query from settings, executes it on `conn` (a JDBC
  # connection) and returns the resulting ResultSet. The caller is
  # responsible for closing the ResultSet and its statement.
  #
  # fullbib is a view in Horizon, I think it was an SD default view, that pulls
  # in stuff from multiple tables, including authority tables, to get actual
  # text. For reference, an equivalent hand-written query selects from
  # `bib b`, left joining bib_control, bib_longtext, auth and auth_longtext
  # (auth tags like '1[0-9][0-9]') on bib#/tag/tagord and cat_link_xref#.
  #
  # You might think need an ORDER BY, but doing so makes it incredibly slow
  # to retrieve results, can't do it. We just count on the view returning
  # the rows properly. (ORDER BY bib#, tagord)
  #
  # We start with the fullbib view defined out of the box in Horizon, but
  # need to join in bib_control to have access to the staff_only column.
  sql = <<-EOS
      SELECT b.*, bc.staff_only
      FROM fullbib b
      JOIN bib_control bc on b.bib# = bc.bib#
      WHERE 1 = 1
  EOS

  if settings["horizon.public_only"].to_s == "true"
    sql += " AND staff_only != 1"
  end

  # settings should not be coming from untrusted user input, not going
  # to bother worrying about sql injection.
  if settings.has_key? "horizon.only_bib"
    sql += " AND b.bib# = #{settings['horizon.only_bib']} "
  elsif settings.has_key?("horizon.first_bib") || settings.has_key?("horizon.last_bib")
    clauses = []
    clauses << " b.bib# >= #{settings['horizon.first_bib']}" if settings['horizon.first_bib']
    clauses << " b.bib# <= #{settings['horizon.last_bib']}" if settings['horizon.last_bib']
    # A key may be present with a nil value; don't emit a dangling "AND"
    # (invalid SQL) when no range clause actually applies.
    sql += " AND " + clauses.join(" AND ") + " " unless clauses.empty?
  end

  pstmt = conn.prepareStatement(sql)

  # this may be what's necessary to keep the driver from fetching
  # entire result set into memory.
  pstmt.setFetchSize(10000)

  logger.debug("HorizonReader: Executing query: #{sql}")
  rs = pstmt.executeQuery
  logger.debug("HorizonReader: Executed!")
  rs
end
|
232
|
+
|
233
|
+
# Converts from Marc8 to UTF8 if neccesary.
|
234
|
+
# Also replaces horizon <U+nnnn> codes if needed.
|
235
|
+
def convert_text!(text, error_handler)
  # Marc8 -> UTF8 transcode via marc4j, only when so configured.
  if convert_marc8_to_utf8?
    text = AnselToUnicode.new(error_handler, true).convert(text)
  end

  # Horizon escapes some characters as <U+nnnn>, nnnn being a hex unicode
  # codepoint. Only expand those when the setting is on AND output is UTF8.
  expand_codepoints =
    settings["horizon.codepoint_translate"].to_s == "true" &&
    settings["horizon.destination_encoding"] == "UTF8"
  return text unless expand_codepoints

  text.gsub!(/\<U\+([0-9A-F]{4})\>/) { [Regexp.last_match(1).hex].pack("U") }
  text
end
|
248
|
+
|
249
|
+
# Read rows from horizon database, assemble them into MARC::Record's, and yield each
|
250
|
+
# MARC::Record to caller.
|
251
|
+
def each
  # Streams rows from the Horizon db, groups consecutive rows with the same
  # bib# into one MARC::Record, and yields records to the caller in batches
  # (after enhance_batch! has had a chance to add copy/item fields).
  #
  # Need to close the connection, the result_set, AND the result_set.getStatement when
  # we're done -- see the ensure clause.
  connection = open_connection!

  # We're going to need to ask for item/copy info while in the
  # middle of streaming our results. JDBC is happier and more performant
  # if we use a separate connection for this.
  extra_connection = open_connection! if include_some_holdings?

  # We're going to make our marc records in batches, and only yield
  # them to caller in batches, so we can fetch copy/item info in batches
  # for efficiency.
  #
  # Coerce to Integer: a value supplied as a String would otherwise blow up
  # in the modulo below. Fall back to the documented default of 400 when the
  # setting is missing or not a positive number.
  batch_size = settings["horizon.batch_size"].to_i
  batch_size = 400 if batch_size <= 0

  # Interval for the debug progress marker; the solrj writer setting may not
  # be present at all, so fall back to our own batch size.
  progress_interval = (settings["solrj_writer.batch_size"] || batch_size).to_i

  record_batch = []

  exclude_tags = (settings["horizon.exclude_tags"] || "").split(",")

  rs = fetch_result_set!(connection)

  current_bib_id = nil
  record         = nil
  record_count   = 0

  error_handler = org.marc4j.ErrorHandler.new

  while rs.next
    bib_id = rs.getInt("bib#")

    if bib_id != current_bib_id
      record_count += 1

      if settings["debug_ascii_progress"] && progress_interval > 0 && (record_count % progress_interval == 0)
        $stderr.write ","
      end

      # new record! Put old one on batch queue.
      record_batch << record if record

      # prepare and yield batch?
      if record_count % batch_size == 0
        enhance_batch!(extra_connection, record_batch)
        record_batch.each do |r|
          # set current_bib_id for error logging
          current_bib_id = (f = r['001']) && f.value
          yield r
        end
        record_batch.clear
      end

      # And start new record we've encountered.
      error_handler  = org.marc4j.ErrorHandler.new
      current_bib_id = bib_id
      record         = MARC::Record.new
      record.append MARC::ControlField.new("001", bib_id.to_s)
    end

    tag = rs.getString("tag")

    # just silently skip it, some weird row in the horizon db, it happens.
    # plus any of our exclude_tags.
    next if tag.nil? || tag == "" || exclude_tags.include?(tag)

    # Must be reset on every row: `while` does not open a new scope, so a
    # conditional assignment would leak the previous row's numeric tag into
    # a row whose tag is not numeric.
    numeric_tag = (tag =~ /\A\d+\Z/) ? tag.to_i : nil

    # may be NULL in the db; treat as empty so slicing below yields nil
    indicators = rs.getString("indicators").to_s

    # a packed byte array could be in various columns, in order of preference...
    # the xref stuff is joined in from the auth table
    # Have to get it as bytes and then convert it to String to avoid JDBC messing
    # up the marc8 encoding.
    authtext = rs.getBytes("xref_longtext") || rs.getBytes("xref_text")
    if authtext
      authtext = String.from_java_bytes(authtext)
      authtext.force_encoding("binary")
    end

    text = rs.getBytes("longtext") || rs.getBytes("text")
    if text
      text = String.from_java_bytes(text)
      text.force_encoding("binary")
    end

    text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!

    next if text.nil? # sometimes there's nothing there, skip it.

    # convert from MARC8 to UTF8 if needed
    text = convert_text!(text, error_handler)

    if numeric_tag == 0
      record.leader = text
      fix_leader!(record.leader)
    elsif numeric_tag == 1
      # nothing, we add the 001 ourselves first
    elsif numeric_tag && numeric_tag < 10
      # control field
      record.append MARC::ControlField.new(tag, text)
    else
      # data field
      indicator1 = indicators.slice(0)
      indicator2 = indicators.slice(1)

      data_field = MARC::DataField.new(tag, indicator1, indicator2)
      record.append data_field

      text.split("\x1F").each do |subfield|
        next if subfield.empty?

        subfield_code = subfield.slice(0)
        subfield_text = subfield.slice(1, subfield.length)

        data_field.append MARC::Subfield.new(subfield_code, subfield_text)
      end
    end
  end

  # last one
  record_batch << record if record

  # yield last batch
  enhance_batch!(extra_connection, record_batch)
  record_batch.each do |r|
    # reset bib_id for error message logging
    current_bib_id = (f = r['001']) && f.value
    yield r
  end
  record_batch.clear

rescue Exception => e
  # Deliberately broad: we only log which bib we were on, then re-raise.
  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
  raise
ensure
  logger.info("HorizonReader: Closing all JDBC objects...")

  begin
    # have to cancel the statement to keep us from waiting on entire
    # result set when exception is raised in the middle of stream.
    statement = rs && rs.getStatement
    if statement
      statement.cancel
      statement.close
    end

    # shouldn't actually need to close the resultset and statement if we cancel, I think.
    rs.close if rs
  ensure
    # Nested ensures so a failure while cancelling/closing the statement
    # still lets us close both connections.
    begin
      connection.close if connection
    ensure
      extra_connection.close if extra_connection
    end
  end

  logger.info("HorizonReader: Closed JDBC objects")
end
|
407
|
+
|
408
|
+
# Placeholder hook: accepts a batch and deliberately does nothing with it.
# Always returns nil.
# NOTE(review): no caller is visible in this part of the file -- confirm
# whether anything still depends on it.
def process_batch(batch)
  nil
end
|
411
|
+
|
412
|
+
# Adds copy and/or item holdings fields to every MARC::Record in
# record_batch, driven by the 'horizon.include_holdings' setting:
#
#   "all"    - copy fields and item fields
#   "copies" - copy fields only
#   "items"  - item fields only
#   "direct" - copy fields, plus item fields only for records that have
#              no copy rows
#
# Any other value leaves the records untouched. Rows are fetched through
# get_joined_table on conn; records are modified in place. Returns
# record_batch so calls can be chained (a nil or empty batch is returned
# immediately, without touching the database).
def enhance_batch!(conn, record_batch)
  return record_batch if record_batch.nil? || record_batch.empty?

  copy_info = nil
  if %w{all copies direct}.include?(settings['horizon.include_holdings'].to_s)
    copy_info = get_joined_table(
      conn, record_batch,
      :table_name  => "copy",
      :column_map  => settings['horizon.copy_map'],
      :join_clause => settings['horizon.copy_join_clause'],
      :public_only => (settings['horizon.public_only'].to_s == "true")
    )
  end

  item_info = nil
  if %w{all items direct}.include?(settings['horizon.include_holdings'].to_s)
    item_info = get_joined_table(
      conn, record_batch,
      :table_name  => "item",
      :column_map  => settings['horizon.item_map'],
      :join_clause => settings['horizon.item_join_clause'],
      :public_only => (settings['horizon.public_only'].to_s == "true")
    )
  end

  # Nothing was fetched, so there is nothing to attach.
  return record_batch unless item_info || copy_info

  record_batch.each do |record|
    bib_id    = record['001'].value.to_s
    copy_rows = copy_info && copy_info[bib_id]
    item_rows = item_info && item_info[bib_id]

    # One copy field per copy row, one subfield per mapped column.
    if copy_rows
      copy_rows.each do |row|
        copy_field = MARC::DataField.new( settings["horizon.copy_tag"] )
        row.each_pair { |code, text| copy_field.append MARC::Subfield.new(code, text) }
        record.append copy_field
      end
    end

    # In "direct" mode item fields are only added when the record has no
    # copy rows; in every other mode they are added whenever present.
    if item_rows && ((settings['horizon.include_holdings'].to_s != "direct") || copy_rows.empty?)
      item_rows.each do |row|
        item_field = MARC::DataField.new( settings["horizon.item_tag"] )
        row.each_pair { |code, text| item_field.append MARC::Subfield.new(code, text) }
        record.append item_field
      end
    end
  end

  record_batch
end
|
463
|
+
|
464
|
+
# Fetches a batch of subsidiary rows (item or copy information) for the
# given records from another table, optionally with joins.
# Usually called with values taken from settings, but a literal call might
# look like this for items:
#
#    get_joined_table(jdbc_conn, array_of_marc_records,
#       :table_name  => "item",
#       :column_map  => {"item.item#" => "i", "call_type.processor" => "k"},
#       :join_clause => "JOIN call_type ON item.call_type = call_type.call_type"
#    )
#
# Returns a hash keyed by bib id; each value is an array of hashes mapping
# subfield code => column value, eg:
#
#    {'343434' => [
#        {
#          'i' => "012124",  # item.item#
#          'k' => 'lccn'     # call_type.processor
#        }
#      ]
#    }
#
# Columns that come back without a value are left out of the row hash.
#
# Pass `:public_only => true` to append a `staff_only != 1` condition to the
# WHERE clause; this assumes the primary table has a staff_only column.
#
# Raises ArgumentError when :table_name or :column_map is missing.
#
# NOTE: table name, column names, join clause and bib ids are interpolated
# into the SQL text as-is.
def get_joined_table(conn, batch, options = {})
  table_name = options[:table_name]
  raise ArgumentError.new("Need a :table_name option") unless table_name
  column_map = options[:column_map]
  raise ArgumentError.new("Need a :column_map option") unless column_map
  join_clause = options[:join_clause] || ""
  public_only = options[:public_only]

  rows_by_bib = Hash.new { |hash, key| hash[key] = [] }

  bib_ids_joined = batch.map { |record| record['001'].value.to_s }.join(",")

  # Each column is aliased to its own prefixed name so it can be read back
  # out of the result set under exactly that name.
  columns_clause = column_map.keys.map { |name| "#{name} AS '#{name}'" }.join(",")

  sql = [
    "SELECT bib#, #{columns_clause}",
    "FROM #{table_name}",
    join_clause,
    "WHERE bib# IN (#{bib_ids_joined})"
  ].map { |line| "#{line}\n" }.join
  sql += " AND staff_only != 1" if public_only

  $stderr.write "<" if settings["debug_ascii_progress"]

  # It might be faster to re-use one prepared statement for every item/copy
  # fetch, but JDBC3 offers no good way to parameterize the values of an
  # "IN" list. JDBC4 does, but jTDS is only JDBC3.
  pstmt = conn.prepareStatement(sql)
  rs = pstmt.executeQuery

  while rs.next
    bib_id = rs.getString("bib#")
    row = {}

    column_map.each_pair do |column, code|
      text = rs.getString(column)
      next unless text

      # Acknowledged hack from the original author: holding notes seem to
      # arrive as UTF8 even though the records are MARC8, which causes
      # problems when exporting as marc8. Tag the value as BINARY unless
      # the destination encoding is UTF8.
      text.force_encoding("BINARY") unless settings["horizon.destination_encoding"] == "UTF8"

      row[code] = text
    end

    rows_by_bib[bib_id] << row
  end

  rows_by_bib
ensure
  # Cancel first, then close the statement, then the result set.
  if pstmt
    pstmt.cancel
    pstmt.close
  end
  rs.close if rs
  $stderr.write ">" if settings["debug_ascii_progress"]
end
|
553
|
+
|
554
|
+
# Fixes up a leader string in place so it is suitable for marc21:
# pads it to 24 characters, sets positions 10-11 to '22' and positions
# 20-23 to '4500', and sets position 9 to 'a' when the
# 'horizon.destination_encoding' setting is "UTF8".
# See http://www.loc.gov/marc/bibliographic/ecbdldrd.html
#
# The string passed in is mutated; callers should not rely on the return
# value.
def fix_leader!(leader)
  # A leader is supposed to be 24 bytes; pad short ones with spaces.
  leader.replace(leader.ljust(24)) if leader.length < 24

  leader[10, 2] = '22'
  leader[20, 4] = '4500'

  leader[9] = 'a' if settings['horizon.destination_encoding'] == "UTF8"
end
|
568
|
+
|
569
|
+
# True unless the 'horizon.include_holdings' setting is false, nil or the
# empty string.
def include_some_holdings?
  holdings_mode = settings['horizon.include_holdings']
  [false, nil, ""].none? { |off_value| off_value == holdings_mode }
end
|
572
|
+
|
573
|
+
# True only when the source encoding setting is "MARC8" and the destination
# encoding setting is "UTF8".
def convert_marc8_to_utf8?
  return false unless settings['horizon.source_encoding'] == "MARC8"

  settings['horizon.destination_encoding'] == "UTF8"
end
|
576
|
+
|
577
|
+
|
578
|
+
# Opens and returns a new JDBC connection to the URL in the
# 'horizon.jdbc_url' setting. When 'horizon.jdbc_password' is set it is
# appended to the URL as a ";password=" parameter (after the debug log
# line, so the appended password is not logged).
def open_connection!
  logger.debug("HorizonReader: Opening JDBC Connection at #{settings["horizon.jdbc_url"]} ...")

  jdbc_url = settings["horizon.jdbc_url"]
  password = settings["horizon.jdbc_password"]
  jdbc_url += ";password=#{password}" if password

  connection = java.sql.DriverManager.getConnection(jdbc_url)

  # If autocommit on, fetchSize later has no effect, and JDBC slurps
  # the whole result set into memory, which we can not handle.
  connection.setAutoCommit false

  logger.debug("HorizonReader: Opened JDBC Connection.")
  connection
end
|
593
|
+
|
594
|
+
# Returns the logger from settings["logger"], or a fallback "null" logger
# (level "gt.fatal", i.e. only things above fatal) when none is configured.
#
# The fallback is built once and reused; previously a brand new
# Yell::Logger was allocated on every call when no logger was configured.
def logger
  settings["logger"] || (@fallback_logger ||= Yell::Logger.new(STDERR, :level => "gt.fatal")) # null logger
end
|
597
|
+
|
598
|
+
# Default values for the horizon.* settings. A fresh hash is built on every
# call.
def self.default_settings
  # The isnull() in the call_type join uses the call_type set directly on
  # the item/copy row when there is one, and otherwise the call_type of the
  # row's collection.
  call_type_join = lambda do |table|
    "LEFT OUTER JOIN collection ON #{table}.collection = collection.collection " \
      "LEFT OUTER JOIN call_type ON isnull(#{table}.call_type, collection.call_type) = call_type.call_type"
  end

  # Database column => subfield code for the item field.
  item_map = {
    "item.call_reconstructed" => "a",
    "call_type.processor"     => "f",
    "call_type.call_type"     => "b",
    "item.copy_reconstructed" => "c",
    "item.staff_only"         => "q",
    "item.item#"              => "i",
    "item.collection"         => "l",
    "item.notes"              => "n",
    "item.location"           => "m"
  }

  # Database column => subfield code for the copy field.
  copy_map = {
    "copy.copy#"          => "8",
    "copy.call"           => "a",
    "copy.copy_number"    => "c",
    "call_type.processor" => "f",
    "copy.staff_only"     => "q",
    "copy.location"       => "m",
    "copy.collection"     => "l",
    "copy.pac_note"       => "n"
  }

  {
    "horizon.batch_size" => 400,

    "horizon.public_only" => true,

    "horizon.source_encoding"      => "MARC8",
    "horizon.destination_encoding" => "UTF8",
    "horizon.codepoint_translate"  => true,

    "horizon.item_tag"         => "991",
    "horizon.item_join_clause" => call_type_join.call("item"),
    "horizon.item_map"         => item_map,

    "horizon.copy_tag"         => "937",
    "horizon.copy_join_clause" => call_type_join.call("copy"),
    "horizon.copy_map"         => copy_map
  }
end
|
640
|
+
end
|
641
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# Encoding: ASCII-8BIT
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
|
5
|
+
require 'traject'
|
6
|
+
require 'traject/horizon_bib_auth_merge'
|
7
|
+
|
8
|
+
# Specs for Traject::HorizonBibAuthMerge: every example builds a merger from
# a MARC tag plus two field-text strings and checks the string returned by
# merge!.
# NOTE(review): no subfield-delimiter bytes are visible in the string
# literals below; they may be non-printing characters -- confirm against the
# original file before editing any of these strings.
describe "HorizonBibAuthMerge" do
|
9
|
+
HzMerge = Traject::HorizonBibAuthMerge # shortcut
|
10
|
+
|
11
|
+
it "does simple example" do
|
12
|
+
assert_equal "aOsmoregulationvCongresses.", HzMerge.new("650", "a v.", "aOsmoregulationvCongresses.").merge!
|
13
|
+
end
|
14
|
+
|
15
|
+
it "adds on simple trailing punctuation" do
|
16
|
+
assert_equal "aHomeostasisvCongresses.", HzMerge.new("650", "a v.", "aHomeostasisvCongresses").merge!
|
17
|
+
end
|
18
|
+
|
19
|
+
it "handles weirder punctuation" do
|
20
|
+
assert_equal "aEastaugh, Steven R.,d1952-", HzMerge.new("100", "a.,d-", "aEastaugh, Steven R.,d1952-").merge!
|
21
|
+
end
|
22
|
+
|
23
|
+
it "merges non-controlled values" do
|
24
|
+
assert_equal "aNational League for Nursing publication ;vno. 52-1870.", HzMerge.new("830", "a ;vno. 52-1870.", "aNational League for Nursing publication ;").merge!
|
25
|
+
end
|
26
|
+
|
27
|
+
it "handles multiple templated subfield with same code" do
|
28
|
+
assert_equal "aMedical carexUtilizationzMarylandzBaltimore.", HzMerge.new("650", "a x z z.", "aMedical carexUtilizationzMarylandzBaltimore.").merge!
|
29
|
+
end
|
30
|
+
|
31
|
+
it "handles tag 240 weirdness" do
|
32
|
+
assert_equal "aProblemy radiaÙtýsionnoµi genetiki.lEnglish", HzMerge.new("240", "a.l ", "aDubinin, Nikolaµi Petrovich,d1907-1998.tProblemy radiaÙtýsionnoµi genetiki.lEnglish").merge!
|
33
|
+
end
|
34
|
+
|
35
|
+
it "preserves space before semi-colon in 830" do
|
36
|
+
# this is actually something Alpha-G's HznExportMarc does differently
|
37
|
+
# than HIP/Horizon -- we try to stick with HIP/Horizon, not entirely
|
38
|
+
# sure if this is a bug in HIP we're reproducing, maybe there shouldn't
|
39
|
+
# be space before the semi-colon?
|
40
|
+
assert_equal "aActa ophthalmologica.pSupplementum ;v81.", HzMerge.new("830", "a.p ;v81.", "aActa ophthalmologica.pSupplementum").merge!
|
41
|
+
end
|
42
|
+
|
43
|
+
it "handles non-matching ending punct" do
|
44
|
+
# Yes, current HIP behavior, as well as marcout and HznMarcOut, ends in
|
45
|
+
# period. I don't know if it's really right, but we'll match current behavior.
|
46
|
+
assert_equal "aWessel, Rosa,d1897.", HzMerge.new("100", "a,d.", "aWessel, Rosa,d1897-").merge!
|
47
|
+
end
|
48
|
+
|
49
|
+
it "a weird non-matching ending punct" do
|
50
|
+
# in this one, HIP and Alpha-G HznMarcOut actually didn't match! We go with HIP.
|
51
|
+
assert_equal "aGreat Britain.bParliament.tPapers by Command ;vCd. 4671.", HzMerge.new("810", "a.b.t ;vCd. 4671.", "aGreat Britain.bParliament.tPapers by Command.").merge!
|
52
|
+
end
|
53
|
+
|
54
|
+
it "handles weird internal multi punct with spaces" do
|
55
|
+
assert_equal "aMiscellaneous publications (Pan American Sanitary Bureau) ;vno. 79.", HzMerge.new("830", "a) ;vno. 79.", "aMiscellaneous publications (Pan American Sanitary Bureau) ;").merge!
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Shared setup for the test suite: activates and loads minitest (autorun
# plus the spec DSL), loads traject and marc, then quiets STDERR warnings
# and Indexer logging.
gem 'minitest' # I feel like this messes with bundler, but only way to get minitest to shut up
|
2
|
+
require 'minitest/autorun'
|
3
|
+
require 'minitest/spec'
|
4
|
+
|
5
|
+
require 'traject'
|
6
|
+
require 'marc'
|
7
|
+
|
8
|
+
# Stops the warning "yell-1.4.0/lib/yell/adapters/io.rb:66 warning: syswrite for buffered IO".
|
9
|
+
# The cause is not fully understood; it involves yell using syswrite while tests sometimes
|
10
|
+
# use $stderr.puts. See https://github.com/TwP/logging/issues/31
|
11
|
+
STDERR.sync = true
|
12
|
+
|
13
|
+
# Hacky way to turn off Indexer logging by default: only log levels
|
14
|
+
# higher than fatal, which means nothing gets logged.
|
15
|
+
require 'traject/indexer/settings'
|
16
|
+
Traject::Indexer::Settings.defaults["log.level"] = "gt.fatal"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Put this gem's own lib/ directory on the load path so the version
# constant can be required straight from the checkout.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'traject_horizon/version'

Gem::Specification.new do |s|
  s.name     = "traject_horizon"
  s.version  = TrajectHorizon::VERSION
  s.authors  = ["Jonathan Rochkind"]
  s.email    = ["jonathan@dnil.net"]
  s.summary  = %q{Horizon ILS MARC Exporter, a plugin for the traject tool}
  s.homepage = "http://github.com/jrochkind/traject_horizon"
  s.license  = "MIT"

  # Package whatever git tracks; executables and test files are picked out
  # of that list by path prefix.
  s.files         = `git ls-files`.split($/)
  s.executables   = s.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  s.add_dependency "traject"

  s.add_development_dependency "bundler", "~> 1.3"
  s.add_development_dependency "rake"
end
|
Binary file
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: traject_horizon
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jonathan Rochkind
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-08-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: traject
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - '>='
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '0'
|
21
|
+
none: false
|
22
|
+
requirement: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
none: false
|
28
|
+
prerelease: false
|
29
|
+
type: :runtime
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: bundler
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ~>
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '1.3'
|
37
|
+
none: false
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '1.3'
|
43
|
+
none: false
|
44
|
+
prerelease: false
|
45
|
+
type: :development
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rake
|
48
|
+
version_requirements: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
none: false
|
54
|
+
requirement: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
none: false
|
60
|
+
prerelease: false
|
61
|
+
type: :development
|
62
|
+
description:
|
63
|
+
email:
|
64
|
+
- jonathan@dnil.net
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- Gemfile
|
71
|
+
- LICENSE.txt
|
72
|
+
- README.md
|
73
|
+
- Rakefile
|
74
|
+
- lib/traject/horizon_bib_auth_merge.rb
|
75
|
+
- lib/traject/horizon_reader.rb
|
76
|
+
- lib/traject_horizon.rb
|
77
|
+
- lib/traject_horizon/version.rb
|
78
|
+
- test/horizon_bib_auth_merge_test.rb
|
79
|
+
- test/test_helper.rb
|
80
|
+
- traject_horizon.gemspec
|
81
|
+
- vendor/jtds/.DS_Store
|
82
|
+
- vendor/jtds/jtds-1.2.8.jar
|
83
|
+
homepage: http://github.com/jrochkind/traject_horizon
|
84
|
+
licenses:
|
85
|
+
- MIT
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
none: false
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
none: false
|
102
|
+
requirements: []
|
103
|
+
rubyforge_project:
|
104
|
+
rubygems_version: 1.8.24
|
105
|
+
signing_key:
|
106
|
+
specification_version: 3
|
107
|
+
summary: Horizon ILS MARC Exporter, a plugin for the traject tool
|
108
|
+
test_files:
|
109
|
+
- test/horizon_bib_auth_merge_test.rb
|
110
|
+
- test/test_helper.rb
|