high_level_browse 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +198 -0
- data/Rakefile +9 -0
- data/bench/bench.rb +57 -0
- data/bench/hlb.json.gz +0 -0
- data/bin/fetch_new_hlb +62 -0
- data/bin/hlb +46 -0
- data/bin/test_marc_file_for_hlb +122 -0
- data/high_level_browse.gemspec +26 -0
- data/lib/high_level_browse.rb +41 -0
- data/lib/high_level_browse/call_number_range.rb +154 -0
- data/lib/high_level_browse/db.rb +150 -0
- data/lib/high_level_browse/range_tree.rb +90 -0
- data/lib/high_level_browse/version.rb +3 -0
- data/test/minitest_helper.rb +14 -0
- data/test/test_high_level_browse.rb +27 -0
- metadata +138 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 286d0ce64d0d9e8dffa58b716f111d086310654d
|
4
|
+
data.tar.gz: 2a13aad07ee29e47b0bcc00f4ba16740491e9bfd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9960852abc0686da303da11c8ead326df0ea7e7df89432962f7d1353e62350afbc7a3ad556d1beecfe6cce816c1bf654ce4bdee78bb195caefdb08caeb67b7cf
|
7
|
+
data.tar.gz: 3d29b51feb0bd70c37eea28248eff4f3dccd8a38cdb23617be8998fdfa821e392743d0f87c25e9f233ee326b4097072ca63d3d04bcdf5c21216ec43a96ecae04
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
*.bundle
|
19
|
+
*.so
|
20
|
+
*.o
|
21
|
+
*.a
|
22
|
+
mkmf.log
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bill Dueber
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# HighLevelBrowse
|
2
|
+
|
3
|
+
Given an LC Call Number, try to get a set of academic disciplines associated with it
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
|
9
|
+
require 'high_level_browse'
|
10
|
+
|
11
|
+
# Pull a new version of the raw data from the UM website,
|
12
|
+
# transform it into something that can be quickly searched,
|
13
|
+
# and serialize it to `hlb.json.gz` in the specified directory
|
14
|
+
hlb = HighLevelBrowse.fetch_and_save(dir: '/tmp')
|
15
|
+
|
16
|
+
# ...or just grab an already fetch_and_saved copy
|
17
|
+
hlb = HighLevelBrowse.load(dir: '/tmp')
|
18
|
+
|
19
|
+
# What HLB categories is an LC Call Number in?
|
20
|
+
hlb.topics 'hc 9112.2'
|
21
|
+
# => [["Social Sciences", "Economics"],
|
22
|
+
# ["Social Sciences", "Social Sciences (General)"]]
|
23
|
+
|
24
|
+
# ... or use the #[] shortcut syntax
|
25
|
+
|
26
|
+
hlb['NC1766 .U52 D733 2014']
|
27
|
+
# => [["Arts", "Art History"],
|
28
|
+
# ["Arts", "Art and Design"],
|
29
|
+
# ["Arts", "Film and Video Studies"]]
|
30
|
+
|
31
|
+
# You can also send more than one call number at a time
|
32
|
+
|
33
|
+
hlb.topics('E 99 .S2 Y67 1993', 'PS 3565 .R5734 F67 2015')
|
34
|
+
# => [["Humanities", "American Culture"],
|
35
|
+
# ["Humanities", "United States History"],
|
36
|
+
# ["Social Sciences", "Native American Studies"],
|
37
|
+
# ["Social Sciences", "Archaeology"],
|
38
|
+
# ["Humanities", "English Language and Literature"]]
|
39
|
+
|
40
|
+
```
|
41
|
+
|
42
|
+
|
43
|
+
## Overview
|
44
|
+
|
45
|
+
While we in the library world sometimes use LC Call Numbers (or at least
|
46
|
+
the initial letters) as a proxy for subject matter, the mapping is iffy
|
47
|
+
in many cases and is, in any case, one-dimensional. Many works simply
|
48
|
+
cover multiple subjects or are relevant to sometimes quite different
|
49
|
+
types of academics.
|
50
|
+
|
51
|
+
Take, for example, the chemistry of the brain as it applies to mental
|
52
|
+
illness. We have a book, _Endorphins : new waves in brain chemistry_
|
53
|
+
cataloged as **QP552.E53 D381 1984**. The QP's map to "Physiology", which
|
54
|
+
is correct but not complete.
|
55
|
+
|
56
|
+
The University of Michigan Library has for years maintained
|
57
|
+
the [High Level Browse](https://www.lib.umich.edu/browse/categories/) (HLB),
|
58
|
+
a mapping of call-number ranges to academic subjects. The entire
|
59
|
+
data set is available as a [1.8MB XML file](https://www.lib.umich.edu/browse/categories/xml.php)
|
60
|
+
for download.
|
61
|
+
|
62
|
+
In the HLB, the call number for _Endorphins : new waves in brain chemistry_ maps
|
63
|
+
to the following categories:
|
64
|
+
|
65
|
+
* Science | Physiology
|
66
|
+
* Health Sciences | Physiology
|
67
|
+
* Health Sciences | Public Health (General)
|
68
|
+
* Science | Chemical Engineering
|
69
|
+
* Engineering | Chemical Engineering
|
70
|
+
* Health Sciences | Biological Chemistry
|
71
|
+
* Science | Chemistry | Biological Chemistry
|
72
|
+
|
73
|
+
This opens up potentially more accurate categorization of works for, say,
|
74
|
+
faceting in a library catalog.
|
75
|
+
|
76
|
+
This gem gives a relatively time-efficient way to get the set of disciplines associated
|
77
|
+
with the given callnumber or callnumbers as part of indexing MARC records into Solr.
|
78
|
+
This mapping is used in many places in the University Library at the University of
|
79
|
+
Michigan, including the
|
80
|
+
[Mirlyn Catalog](https://mirlyn.lib.umich.edu/)
|
81
|
+
(exposed as "Academic Discipline" in the facets) and ejournals/databases (and even
|
82
|
+
Librarians!) via the [Browse page](https://www.lib.umich.edu/browse).
|
83
|
+
|
84
|
+
This categorization may be useful for clustering/faceting
|
85
|
+
in similar applications at other institutions. Note that the actual creation and
|
86
|
+
maintenance of the call number ranges is done by subject specialist librarians and
|
87
|
+
is out of scope for this gem.
|
88
|
+
|
89
|
+
## Command line utilities: `fetch_new_hlb` and `hlb`
|
90
|
+
|
91
|
+
There are also a couple of command line applications for managing and querying the
|
92
|
+
data.
|
93
|
+
|
94
|
+
* **fetch_new_hlb** tries to grab a new copy of the data from the umich website
|
95
|
+
and serialize it to a ~500k file called `hlb.json.gz` in the given directory.
|
96
|
+
Useful for putting in a cron job to periodically update with fresh data
|
97
|
+
|
98
|
+
```bash
|
99
|
+
|
100
|
+
$> fetch_new_hlb
|
101
|
+
|
102
|
+
fetch_new_hlb -- get a new copy of the HLB ready for use by high_level_browse
|
103
|
+
and stick it in the given directory
|
104
|
+
|
105
|
+
Usage: fetch_new_hlb <dir>
|
106
|
+
```
|
107
|
+
|
108
|
+
* **hlb** takes one or more callnumbers and returns a text display of the categories
|
109
|
+
associated with them. It will stash a copy of the database in `Dir.tmpdir` if there
|
110
|
+
isn't one there already, and use it on subsequent calls so things aren't so
|
111
|
+
desperately slow. (To find your tmpdir, in your shell
|
112
|
+
run `ruby -e 'require "tmpdir"; puts Dir.tmpdir'`)
|
113
|
+
|
114
|
+
|
115
|
+
```bash
|
116
|
+
$> hlb
|
117
|
+
|
118
|
+
hlb -- get high level browse data for an LC call number
|
119
|
+
|
120
|
+
Example:
|
121
|
+
hlb "qa 11.33 .C4 .H3"
|
122
|
+
or do several at once
|
123
|
+
hlb "PN 33.4" "AC 1122.3 .C22" ...
|
124
|
+
|
125
|
+
# Let's try it
|
126
|
+
$> hlb "qa 11.33 .C4"
|
127
|
+
|
128
|
+
Science | Mathematics
|
129
|
+
Social Sciences | Education
|
130
|
+
|
131
|
+
```
|
132
|
+
|
133
|
+
|
134
|
+
## A warning about (lack of) coverage
|
135
|
+
|
136
|
+
Note that not every possible valid callnumber will necessarily be contained in any
|
137
|
+
discipline at all. Many books aren't academic in nature, and even then
|
138
|
+
coverage is known to have some holes. Some of the ranges cover essentially a
|
139
|
+
single book in the umich collection. And, of course, not every record is going
|
140
|
+
to have a LC Call Number, so there's that.
|
141
|
+
|
142
|
+
This is all to say: this may or may not be useful at your institution. You'll
|
143
|
+
have to experiment.
|
144
|
+
|
145
|
+
To help with this, there's a little script in the `bin/` directory called
|
146
|
+
`test_marc_file_for_hlb` which will, when given a MARC-XML file (ending in `.xml`)
|
147
|
+
or a MARC-binary file (ending in anything else), output some statistics on
|
148
|
+
what kind of coverage you would get. It might be useful to send a test file
|
149
|
+
through there to see what comes up. It looks in the `050` and the `852[h]` to
|
150
|
+
see if anything pops, but you can make it look elsewhere pretty easily.
|
151
|
+
|
152
|
+
It produces something like this:
|
153
|
+
|
154
|
+
```
|
155
|
+
050 fields
|
156
|
+
9790 total
|
157
|
+
209 not recognized as LC call numbers
|
158
|
+
9337 with at least one HLB category
|
159
|
+
244 with NO category
|
160
|
+
|
161
|
+
Of 17642 records,
|
162
|
+
9677 (54.85%) had a field that often contains an LC Call Number
|
163
|
+
9262 (95.71%) of *those* had at least one HLB category
|
164
|
+
|
165
|
+
```
|
166
|
+
|
167
|
+
## Performance
|
168
|
+
|
169
|
+
On my laptop under normal load (i.e., not very scientific at all)
|
170
|
+
I get the following running in a single thread
|
171
|
+
|
172
|
+
```
|
173
|
+
ruby 2.3 this gem ~8500 lookups/second
|
174
|
+
ruby 2.4 this gem ~9100 lookups/second
|
175
|
+
jruby 9 this gem ~20,000 lookups/second
|
176
|
+
jruby 9, old HLB.jar ~6500 lookups/second
|
177
|
+
jruby 1.7 this gem error, can't do named arguments since it's 1.9 mode
|
178
|
+
jruby 1.7 old HLB.jar ~6700 lookups/second
|
179
|
+
```
|
180
|
+
|
181
|
+
The [old HLB.jar](https://github.com/billdueber/HLB-Java) refers to a pure java version that I call from within
|
182
|
+
JRuby as part of my catalog indexing process now. It has a different (worse) algorithm, but is of
|
183
|
+
interest because it's what I'm writing this to replace.
|
184
|
+
|
185
|
+
## Installation
|
186
|
+
|
187
|
+
```bash
|
188
|
+
gem install high_level_browse
|
189
|
+
```
|
190
|
+
|
191
|
+
|
192
|
+
## Contributing
|
193
|
+
|
194
|
+
1. Fork it ( https://github.com/[my-github-username]/high_level_browse/fork )
|
195
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
196
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
197
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
198
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bench/bench.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'benchmark/ips'
|
2
|
+
$:.unshift '../lib'
|
3
|
+
$:.unshift '.'
|
4
|
+
|
5
|
+
|
6
|
+
# On my laptop under normal load (i.e., not very scientific at all)
|
7
|
+
# I get the following running in a single thread
|
8
|
+
# ruby 2.3 ~8500 lookups/second
|
9
|
+
# ruby 2.4 ~9100 lookups/second
|
10
|
+
# jruby 9 ~20k lookups/second
|
11
|
+
# jruby 9, old HLB.jar ~6500 lookups/second
|
12
|
+
# jruby 1.7 error, can't do named arguments
|
13
|
+
# jruby 1.7, old HLB.jar ~6700 lookups/second
|
14
|
+
#
|
15
|
+
# The old HLB.jar has a different (worse) algorithm, but is of
|
16
|
+
# interest because it's what I'm writing this to replace.
|
17
|
+
|
18
|
+
# umich_traject holds .jar files with the old java implementation; see
|
19
|
+
# https://github.com/hathitrust/ht_traject/tree/9e8d414fd9bb2c79e243d289c4d39c05d2de27e5/lib/umich_traject
|
20
|
+
#
|
21
|
+
|
22
|
+
TEST_OLD_STUFF = defined?(JRUBY_VERSION) && Dir.exist?('./umich_traject') # use && (not `and`, which binds looser than `=`) so the jar-directory check is part of the assigned value
|
23
|
+
if TEST_OLD_STUFF
|
24
|
+
puts "Loading old HLB3.jar stuff"
|
25
|
+
require 'umich_traject/jackson-core-asl-1.4.3.jar'
|
26
|
+
require 'umich_traject/jackson-mapper-asl-1.4.3.jar'
|
27
|
+
require 'umich_traject/apache-solr-umichnormalizers.jar'
|
28
|
+
require 'umich_traject/HLB3.jar'
|
29
|
+
java_import Java::edu.umich.lib.hlb::HLB
|
30
|
+
puts "Initializing HLB"
|
31
|
+
HLB.initialize()
|
32
|
+
end
|
33
|
+
|
34
|
+
require 'high_level_browse'
|
35
|
+
|
36
|
+
h = HighLevelBrowse.load(dir: '.')
|
37
|
+
|
38
|
+
cns = File.read('call_numbers.txt').split(/\n/).cycle
|
39
|
+
|
40
|
+
puts RUBY_DESCRIPTION
|
41
|
+
|
42
|
+
total = 0
|
43
|
+
Benchmark.ips do |x|
|
44
|
+
x.config(:time => 25, :warmup => 25)
|
45
|
+
|
46
|
+
x.report("HLB lookups") do
|
47
|
+
total += h[cns.next].count
|
48
|
+
end
|
49
|
+
|
50
|
+
if TEST_OLD_STUFF
|
51
|
+
total = 0
|
52
|
+
x.report("Old java lookups") do
|
53
|
+
total += HLB.categories(cns.next).to_a.count
|
54
|
+
end
|
55
|
+
x.compare!
|
56
|
+
end
|
57
|
+
end
|
data/bench/hlb.json.gz
ADDED
Binary file
|
data/bin/fetch_new_hlb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# If we're loading from source instead of a gem, rubygems
|
4
|
+
# isn't setting load paths for us, so we need to set it ourselves
|
5
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
6
|
+
unless $LOAD_PATH.include? self_load_path
|
7
|
+
$LOAD_PATH << self_load_path
|
8
|
+
end
|
9
|
+
|
10
|
+
def silence_warnings(&block)
|
11
|
+
warn_level = $VERBOSE
|
12
|
+
$VERBOSE = nil
|
13
|
+
result = block.call
|
14
|
+
$VERBOSE = warn_level
|
15
|
+
result
|
16
|
+
end
|
17
|
+
|
18
|
+
# minitest has a circular require warning, which
|
19
|
+
# drives me crazy. Suppress it.
|
20
|
+
silence_warnings do
|
21
|
+
require 'high_level_browse'
|
22
|
+
end
|
23
|
+
|
24
|
+
require 'fileutils'
|
25
|
+
|
26
|
+
def putsmsg(msg)
|
27
|
+
puts "---------------------------------------------------"
|
28
|
+
puts " ERROR: #{msg}"
|
29
|
+
puts "---------------------------------------------------"
|
30
|
+
puts
|
31
|
+
end
|
32
|
+
|
33
|
+
def usage(msg = nil)
|
34
|
+
puts
|
35
|
+
putsmsg(msg) if msg
|
36
|
+
puts "fetch_new_hlb -- get a new copy of the HLB ready for use by high_level_browse"
|
37
|
+
puts "and stick it in the given directory"
|
38
|
+
puts
|
39
|
+
puts " Usage: fetch_new_hlb <dir>"
|
40
|
+
puts
|
41
|
+
exit
|
42
|
+
end
|
43
|
+
|
44
|
+
unless ARGV.size == 1
|
45
|
+
usage
|
46
|
+
end
|
47
|
+
|
48
|
+
dir = ARGV.shift
|
49
|
+
|
50
|
+
File.exist? dir or usage "#{dir} does not exist"
|
51
|
+
Dir.exist? dir or usage "#{dir} is not a directory"
|
52
|
+
File.writable? dir or usage "#{dir} is not writable"
|
53
|
+
|
54
|
+
begin
|
55
|
+
db = HighLevelBrowse.fetch
|
56
|
+
db.save(dir: dir)
|
57
|
+
rescue => e
|
58
|
+
puts "============================="
|
59
|
+
puts "ERROR FETCHING HLB SOURCE"
|
60
|
+
puts " #{e}"
|
61
|
+
puts "============================="
|
62
|
+
end
|
data/bin/hlb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Hmmm. How to pass along the location? Stick one in /tmp
|
4
|
+
# and see if it exists?
|
5
|
+
|
6
|
+
|
7
|
+
def usage
|
8
|
+
puts "hlb -- get high level browse data for an LC call number"
|
9
|
+
puts
|
10
|
+
puts %Q{Example:\n hlb "qa 11.33 .C4 .H3"}
|
11
|
+
puts " or do several at once"
|
12
|
+
puts %Q{ hlb "PN 33.4" "AC 1122.3 .C22" ... }
|
13
|
+
puts
|
14
|
+
exit(1)
|
15
|
+
end
|
16
|
+
|
17
|
+
usage if ARGV.empty?
|
18
|
+
|
19
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
20
|
+
unless $LOAD_PATH.include? self_load_path
|
21
|
+
$LOAD_PATH << self_load_path
|
22
|
+
end
|
23
|
+
|
24
|
+
require 'high_level_browse'
|
25
|
+
require 'fileutils'
|
26
|
+
require 'tmpdir'
|
27
|
+
|
28
|
+
filename = HighLevelBrowse::DB::FILENAME
|
29
|
+
dir = Dir.tmpdir()
|
30
|
+
fullpath = File.join(dir, filename)
|
31
|
+
|
32
|
+
hlb = if File.exist?(fullpath)
|
33
|
+
HighLevelBrowse.load(dir: dir)
|
34
|
+
else
|
35
|
+
STDERR.puts "Fetching raw data from UMich; wait a sec"
|
36
|
+
HighLevelBrowse.fetch_and_save(dir: dir)
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
topics = hlb[*ARGV]
|
41
|
+
|
42
|
+
if topics.empty?
|
43
|
+
puts "\nNo categories found for #{ARGV}\n\n"
|
44
|
+
else
|
45
|
+
puts "\n" + topics.map { |x| x.join(' | ') }.join("\n") + "\n\n"
|
46
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
4
|
+
unless $LOAD_PATH.include? self_load_path
|
5
|
+
$LOAD_PATH << self_load_path
|
6
|
+
end
|
7
|
+
|
8
|
+
require 'marc'
|
9
|
+
require 'high_level_browse'
|
10
|
+
require 'lcsort'
|
11
|
+
require 'tmpdir'
|
12
|
+
|
13
|
+
|
14
|
+
filename = ARGV[0]
|
15
|
+
|
16
|
+
reader = if filename =~ /xml\Z/i
|
17
|
+
MARC::XMLReader.new(filename)
|
18
|
+
else
|
19
|
+
MARC::Reader.new(filename)
|
20
|
+
end
|
21
|
+
|
22
|
+
Counter = Struct.new(:count, :invalid, :found, :notfound, :hlb) do
|
23
|
+
|
24
|
+
def update(cn)
|
25
|
+
self.count += 1
|
26
|
+
case check_cn(cn)
|
27
|
+
when :invalid
|
28
|
+
self.invalid += 1
|
29
|
+
0
|
30
|
+
when :found
|
31
|
+
self.found += 1
|
32
|
+
1
|
33
|
+
when :notfound
|
34
|
+
self.notfound += 1
|
35
|
+
0
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
def check_cn(cn)
|
41
|
+
normalized = Lcsort.normalize(cn)
|
42
|
+
return :invalid if normalized.nil?
|
43
|
+
cats = hlb[cn]
|
44
|
+
if cats.empty?
|
45
|
+
:notfound
|
46
|
+
else
|
47
|
+
:found
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def puts_pretty_output
|
52
|
+
puts '%9d total' % count
|
53
|
+
puts '%9d not recognized as LC call numbers' % invalid
|
54
|
+
puts '%9d with at least one HLB category' % found
|
55
|
+
puts '%9d with NO category' % notfound
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
|
60
|
+
def puts_output(f050, f852)
|
61
|
+
puts "050 fields"
|
62
|
+
f050.puts_pretty_output
|
63
|
+
puts "\n852h fields"
|
64
|
+
f852.puts_pretty_output
|
65
|
+
end
|
66
|
+
|
67
|
+
puts "Fetching/parsing HLB XML file"
|
68
|
+
filename = HighLevelBrowse::DB::FILENAME
|
69
|
+
dir = Dir.tmpdir()
|
70
|
+
fullpath = File.join(dir, filename)
|
71
|
+
|
72
|
+
hlb = if File.exist?(fullpath)
|
73
|
+
puts "Using file at #{fullpath}"
|
74
|
+
HighLevelBrowse.load(dir: dir)
|
75
|
+
else
|
76
|
+
HighLevelBrowse.fetch_and_save(dir: dir)
|
77
|
+
end
|
78
|
+
|
79
|
+
f050 = Counter.new(0, 0, 0, 0, hlb)
|
80
|
+
f852 = Counter.new(0, 0, 0, 0, hlb)
|
81
|
+
records = 0
|
82
|
+
matched_records = 0
|
83
|
+
possible_records = 0
|
84
|
+
puts "Beginning analysis of marc records with 2k record progress reports"
|
85
|
+
reader.each do |r|
|
86
|
+
records += 1
|
87
|
+
found = 0
|
88
|
+
possible = false
|
89
|
+
puts '%8d records processed so far' % records if records % 2_000 == 0
|
90
|
+
if r['050']
|
91
|
+
cns = r.fields('050').map { |x| x.map(&:value).join('') }
|
92
|
+
cns.each do |cn|
|
93
|
+
found += f050.update(cn)
|
94
|
+
possible = true
|
95
|
+
end
|
96
|
+
cns = r.fields('852').keep_if { |x| x['h'] }.map { |x| x['h'] }
|
97
|
+
cns.each do |cn|
|
98
|
+
found += f852.update(cn)
|
99
|
+
possible = true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
matched_records += 1 if found > 0
|
103
|
+
possible_records += 1 if possible
|
104
|
+
end
|
105
|
+
|
106
|
+
puts "\n\n"
|
107
|
+
puts_output(f050, f852)
|
108
|
+
puts format(
|
109
|
+
%Q[\nOf %d records,
|
110
|
+
%d (%4.2f%%) had a field that often contains an LC Call Number
|
111
|
+
%d (%4.2f%%) of *those* had at least one HLB category],
|
112
|
+
records,
|
113
|
+
possible_records,
|
114
|
+
possible_records.to_f / records * 100,
|
115
|
+
matched_records,
|
116
|
+
matched_records.to_f / possible_records * 100)
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'high_level_browse/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "high_level_browse"
|
8
|
+
spec.version = HighLevelBrowse::VERSION
|
9
|
+
spec.authors = ["Bill Dueber"]
|
10
|
+
spec.email = ["bill@dueber.com"]
|
11
|
+
spec.summary = %q{Map LC call numbers to academic categories.}
|
12
|
+
spec.homepage = ""
|
13
|
+
spec.license = "MIT"
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0")
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ["lib"]
|
19
|
+
|
20
|
+
spec.add_dependency 'oga', '~> 2.1'
|
21
|
+
spec.add_dependency 'lcsort'
|
22
|
+
|
23
|
+
spec.add_development_dependency "bundler", "~> 1.6"
|
24
|
+
spec.add_development_dependency "rake"
|
25
|
+
spec.add_development_dependency "minitest"
|
26
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "high_level_browse/version"
|
2
|
+
require 'high_level_browse/db'
|
3
|
+
require 'uri'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
module HighLevelBrowse
|
7
|
+
|
8
|
+
SOURCE_URL = ENV['HLB_XML_ENDPOINT'] || 'https://www.lib.umich.edu/browse/categories/xml.php'
|
9
|
+
|
10
|
+
# Fetch a new version of the raw file and turn it into a db
|
11
|
+
# @return [DB] The loaded database
|
12
|
+
def self.fetch
|
13
|
+
uri = URI.parse(SOURCE_URL)
|
14
|
+
# Why on earth OpenURI::OpenRead is mixed into http but not https, I don't know
|
15
|
+
uri.extend OpenURI::OpenRead
|
16
|
+
|
17
|
+
xml = uri.read
|
18
|
+
return DB.new_from_xml(xml)
|
19
|
+
rescue => e
|
20
|
+
raise "Could not fetch xml from '#{SOURCE_URL}': #{e}"
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
# Fetch and save to the specified directory
|
25
|
+
# @param [String] dir The directory where the hlb.json.gz file will end up
|
26
|
+
# @return [DB] The fetched and saved database
|
27
|
+
def self.fetch_and_save(dir:)
|
28
|
+
db = self.fetch
|
29
|
+
db.save(dir: dir)
|
30
|
+
db
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
# Load from disk
|
35
|
+
# @param [String] dir The directory where the hlb.json.gz file is located
|
36
|
+
# @return [DB] The loaded database
|
37
|
+
def self.load(dir:)
|
38
|
+
DB.load(dir: dir)
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
require 'lcsort'
|
2
|
+
require 'high_level_browse/range_tree'
|
3
|
+
|
4
|
+
|
5
|
+
# An efficient set of CallNumberRanges from which to get topics
|
6
|
+
class HighLevelBrowse::CallNumberRangeSet < HighLevelBrowse::RangeTree
|
7
|
+
|
8
|
+
|
9
|
+
# Returns the array of topic arrays for the given LC string
|
10
|
+
# @param [String] raw_lc A raw LC string (eg., 'qa 112.3 .A4 1990')
|
11
|
+
# @return [Array<Array<String>>] Arrays of topic labels
|
12
|
+
def topics_for(raw_lc)
|
13
|
+
normalized = Lcsort.normalize(HighLevelBrowse::CallNumberRange.preprocess(raw_lc))
|
14
|
+
self.search(normalized).map(&:topic_array).uniq
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
# A callnumber-range keeps track of the original begin/end
|
20
|
+
# strings as well as the normalized versions, and can be
|
21
|
+
# serialized to JSON
|
22
|
+
|
23
|
+
class HighLevelBrowse::CallNumberRange
|
24
|
+
include Comparable
|
25
|
+
|
26
|
+
attr_reader :min, :max, :min_raw, :max_raw, :firstletter
|
27
|
+
|
28
|
+
|
29
|
+
attr_accessor :topic_array, :redundant
|
30
|
+
|
31
|
+
SPACE_OR_PUNCT = /\A[\s\p{Punct}]*(.*?)[\s\p{Punct}]*\Z/
|
32
|
+
DIGIT_TO_LETTER = /(\d)([A-Z])/i
|
33
|
+
|
34
|
+
# @nodoc
|
35
|
+
# Remove spaces/punctuation from the ends of the string
|
36
|
+
def self.strip_spaces_and_punct(str)
|
37
|
+
str.gsub(SPACE_OR_PUNCT, '\1')
|
38
|
+
end
|
39
|
+
|
40
|
+
# @nodoc
|
41
|
+
# Force a space between any digit->letter transition
|
42
|
+
def self.force_break_between_digit_and_letter(str)
|
43
|
+
str.gsub(DIGIT_TO_LETTER, '\1 \2')
|
44
|
+
end
|
45
|
+
# @nodoc
|
46
|
+
# Preprocess the string, removing spaces/punctuation off the end
|
47
|
+
# and forcing a space where there's a digit->letter transition
|
48
|
+
def self.preprocess(str)
|
49
|
+
str ||= ''
|
50
|
+
force_break_between_digit_and_letter(
|
51
|
+
strip_spaces_and_punct(str)
|
52
|
+
)
|
53
|
+
end
|
54
|
+
|
55
|
+
|
56
|
+
def initialize(min:, max:, topic_array:)
|
57
|
+
@illegal = false
|
58
|
+
@redundant = false
|
59
|
+
self.min = self.class.preprocess(min)
|
60
|
+
self.max = self.class.preprocess(max)
|
61
|
+
@topic_array = topic_array
|
62
|
+
@firstletter = self.min[0] unless @illegal
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
# Compare based on @min, then @max
|
67
|
+
# @param [CallNumberRange] o the range to compare to
|
68
|
+
def <=>(o)
|
69
|
+
[self.min, self.max] <=> [o.min, o.max]
|
70
|
+
end
|
71
|
+
|
72
|
+
def to_s
|
73
|
+
"[#{self.min_raw} - #{self.max_raw}]"
|
74
|
+
end
|
75
|
+
|
76
|
+
def reconstitute(min, max, min_raw, max_raw, firstletter, topic_array)
|
77
|
+
@min = min
|
78
|
+
@max = max
|
79
|
+
@min_raw = min_raw
|
80
|
+
@max_raw = max_raw
|
81
|
+
@firstletter = firstletter
|
82
|
+
@topic_array = topic_array
|
83
|
+
end
|
84
|
+
|
85
|
+
|
86
|
+
# Two ranges are equal if their @min, @max, and topic array
|
87
|
+
# are all the same
|
88
|
+
# @param [CallNumberRange] other the range to compare to
|
89
|
+
def ==(other)
|
90
|
+
@min == other.min and
|
91
|
+
@max == other.max and
|
92
|
+
@topic_array == other.topic_array
|
93
|
+
end
|
94
|
+
|
95
|
+
|
96
|
+
# @nodoc
|
97
|
+
# JSON roundtrip
|
98
|
+
def to_json(*a)
|
99
|
+
{
|
100
|
+
'json_class' => self.class.name,
|
101
|
+
'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
|
102
|
+
}.to_json(*a)
|
103
|
+
end
|
104
|
+
|
105
|
+
# @nodoc
|
106
|
+
def self.json_create(h)
|
107
|
+
cnr = self.allocate
|
108
|
+
cnr.reconstitute(*(h['data']))
|
109
|
+
cnr
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
# In both min= and max=, we also catch values that fail to normalize
|
114
|
+
# and simply set the @illegal flag so we can use it later on.
|
115
|
+
def min=(x)
|
116
|
+
@min_raw = x
|
117
|
+
possible_min = Lcsort.normalize(x)
|
118
|
+
if possible_min.nil? # didn't normalize
|
119
|
+
@illegal = true
|
120
|
+
nil
|
121
|
+
else
|
122
|
+
@min = possible_min
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# Same as min=. Set the illegal flag if the value doesn't normalize
|
127
|
+
def max=(x)
|
128
|
+
@max_raw = x
|
129
|
+
possible_max = Lcsort.normalize(x)
|
130
|
+
if possible_max.nil? # didn't normalize
|
131
|
+
@illegal = true
|
132
|
+
nil
|
133
|
+
else
|
134
|
+
@max = possible_max + '~' # add a tilde to make it a true endpoint
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
def illegal?
|
139
|
+
@illegal
|
140
|
+
end
|
141
|
+
|
142
|
+
|
143
|
+
def surrounds(other)
|
144
|
+
@min <= other.min and @max >= other.max
|
145
|
+
end
|
146
|
+
|
147
|
+
def contains(x)
|
148
|
+
@min <= x and @max >= x
|
149
|
+
end
|
150
|
+
|
151
|
+
alias_method :cover?, :contains
|
152
|
+
alias_method :member?, :contains
|
153
|
+
|
154
|
+
end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'oga'
|
2
|
+
require 'high_level_browse/call_number_range'
|
3
|
+
require 'zlib'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
class HighLevelBrowse::DB
|
7
|
+
|
8
|
+
# Hard-code filename. If you need more than one, put them
|
9
|
+
# in different directories
|
10
|
+
FILENAME = 'hlb.json.gz'
|
11
|
+
|
12
|
+
# Given a bunch of CallNumberRange objects, create a new
|
13
|
+
# database with an efficient structure for querying
|
14
|
+
# @param [Array<HighLevelBrowse::CallNumberRange>] array_of_ranges
|
15
|
+
def initialize(array_of_ranges)
|
16
|
+
@all = array_of_ranges
|
17
|
+
@ranges = self.create_letter_indexed_ranges(@all)
|
18
|
+
end
|
19
|
+
|
20
|
+
# Given an array of ranges, create efficient
|
21
|
+
# search structures
|
22
|
+
# @private
|
23
|
+
def create_letter_indexed_ranges(all)
|
24
|
+
bins = {}
|
25
|
+
('A'..'Z').each do |letter|
|
26
|
+
cnrs = all.find_all {|x| x.firstletter == letter}
|
27
|
+
bins[letter] = HighLevelBrowse::CallNumberRangeSet.new(cnrs)
|
28
|
+
end
|
29
|
+
bins
|
30
|
+
end
|
31
|
+
|
32
|
+
# Get the topic arrays associated with this callnumber
|
33
|
+
# of the form:
|
34
|
+
# [
|
35
|
+
# [toplevel, secondlevel],
|
36
|
+
# [toplevel, secondlevel, thirdlevel],
|
37
|
+
# ...
|
38
|
+
# ]
|
39
|
+
# @param [String] raw_callnumber_string
|
40
|
+
# @return [Array<Array>] A (possibly empty) array of arrays of topics
|
41
|
+
def topics(*raw_callnumber_strings)
|
42
|
+
raw_callnumber_strings.reduce([]) do |acc, raw_callnumber_string|
|
43
|
+
firstletter = raw_callnumber_string.strip.upcase[0]
|
44
|
+
if @ranges.has_key? firstletter
|
45
|
+
acc + @ranges[firstletter].topics_for(raw_callnumber_string)
|
46
|
+
else
|
47
|
+
acc
|
48
|
+
end
|
49
|
+
end.uniq
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
alias_method :[], :topics
|
54
|
+
|
55
|
+
# Create a new object from a string with the XML
|
56
|
+
# in it.
|
57
|
+
# @param [String] xml The contents of the HLB XML dump
|
58
|
+
# (e.g., from 'https://www.lib.umich.edu/browse/categories/xml.php')
|
59
|
+
# @return [DB]
|
60
|
+
def self.new_from_xml(xml)
|
61
|
+
oga_doc_root = Oga.parse_xml(xml)
|
62
|
+
simple_array_of_cnrs = cnrs_within_oga_node(node: oga_doc_root)
|
63
|
+
self.new(simple_array_of_cnrs).freeze
|
64
|
+
end
|
65
|
+
|
66
|
+
|
67
|
+
# Save to disk
|
68
|
+
# @param [String] dir The directory where the hlb.json.gz file will be saved
|
69
|
+
# @return [void]
|
70
|
+
def save(dir:)
|
71
|
+
Zlib::GzipWriter.open(File.join(dir, FILENAME)) do |out|
|
72
|
+
out.puts JSON.fast_generate(@all)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
# Load from disk
|
78
|
+
# @param [String] dir The directory where the hlb.json.gz file is located
|
79
|
+
# @return [DB] The loaded database
|
80
|
+
def self.load(dir:)
|
81
|
+
simple_array_of_cnrs = Zlib::GzipReader.open(File.join(dir, FILENAME)) do |infile|
|
82
|
+
JSON.load(infile.read).to_a
|
83
|
+
end
|
84
|
+
db = self.new(simple_array_of_cnrs)
|
85
|
+
db.freeze
|
86
|
+
db
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
# Freeze everything
|
91
|
+
# @return [DB] the frozen db
|
92
|
+
def freeze
|
93
|
+
@ranges.freeze
|
94
|
+
@all.freeze
|
95
|
+
self
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
# Recurse through the parsed XML document, at each stage keeping track of
|
101
|
+
# * where we are (what are the xpath children?)
|
102
|
+
# * what the current topics are ([level1, level2])
|
103
|
+
# Get all the call numbers associated with the topic represented by the given node,
|
104
|
+
# as well as all the children of the given node, and send it back as a big ol' array
|
105
|
+
# @param [Oga::Node] node A node of the parsed HLB XML file
|
106
|
+
# @param [Array<String>] decendent_xpaths A list of xpaths to the decendents of this node
|
107
|
+
# @param [Array<String>] topic_array An array with all levels of the topics associated with this node
|
108
|
+
# @return [Array<HighLevelBrowse::CallNumberRange>]
|
109
|
+
def self.cnrs_within_oga_node(node:, decendent_xpaths: ['/hlb/subject', 'topic', 'sub-topic'], topic_array: [])
  # Base case -- no xpath levels remain, so there is nothing below us.
  return [] if decendent_xpaths.empty?

  # Peel off the xpath for this level; the remainder is passed down.
  this_level, *lower_levels = decendent_xpaths

  # Work on a copy so the caller's topic list is never modified, and
  # skip the document root (it contributes no topic name).
  topics_here = topic_array.dup
  topics_here.push node.get(:name) unless node == node.root_node

  # For each matching child, collect the call number ranges associated
  # with that child, then recursively collect from the child's subtree.
  node.xpath(this_level).each_with_object([]) do |child, collected|
    collected.concat(call_numbers_list_from_leaves(node: child, topic_array: topics_here))
    collected.concat(cnrs_within_oga_node(node: child, decendent_xpaths: lower_levels, topic_array: topics_here))
  end
end
|
127
|
+
|
128
|
+
|
129
|
+
# Given a second-to-lowest-level node, get its topic and
|
130
|
+
# extract call number ranges from its children
|
131
|
+
def self.call_numbers_list_from_leaves(node:, topic_array:)
  # The topic path for every range found here is the inherited path plus
  # this node's own name (built on a copy; the argument is not modified).
  full_topic = topic_array.dup.push node.get(:name)

  node.xpath('call-numbers').each_with_object([]) do |cn_node, kept|
    candidate = HighLevelBrowse::CallNumberRange.new(
      min: cn_node.get(:start),
      max: cn_node.get(:end),
      topic_array: full_topic
    )
    # Illegal ranges are silently dropped (no logging is done yet).
    kept.push candidate unless candidate.illegal?
  end
end
|
148
|
+
|
149
|
+
|
150
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# Never released as a gem, as near as I can tell.
|
2
|
+
# Taken from https://github.com/clearhaus/range-tree,
|
3
|
+
# which was released under the MIT license
|
4
|
+
# by ClearHaus (https://www.clearhaus.com/)
|
5
|
+
|
6
|
+
# Namespaced to avoid conflicts with other range_tree
|
7
|
+
# gems
|
8
|
+
|
9
|
+
module HighLevelBrowse
  # A static, balanced binary tree over range-like objects (anything that
  # responds to #min and #max). Every node also records the smallest min and
  # largest max found anywhere in its subtree, which lets a search skip
  # subtrees that cannot overlap the query.
  class RangeTree
    # One tree node: the range stored at this position, its two children, and
    # the min/max bounds of the whole subtree rooted here.
    class Node
      # @param [Node, nil] left  Left child
      # @param [#min, #max] range The range stored at this node
      # @param [Node, nil] right Right child
      # @param [Object, nil] min Subtree minimum; falls back to range.min
      # @param [Object, nil] max Subtree maximum; falls back to range.max
      def initialize(left, range, right, min, max)
        @left  = left
        @range = range
        @right = right
        @min   = min || range.min
        @max   = max || range.max
      end

      attr_reader :left, :range, :right, :min, :max
    end

    # Build the tree. The input array is not modified.
    #
    # @param [Array<#min, #max>] ranges The ranges to index
    # @param [Boolean] sorted Accepted for interface compatibility but
    #   currently ignored: the ranges are always sorted here.
    def initialize(ranges, sorted: false)
      # It's only required to be sorted by `r.min`, but if many ranges have the
      # same left endpoint, then it's more efficient if also secondarily sorted
      # by the right endpoint (or equivalently by the length).
      #
      # `<=>` returns 0 for equal keys and 0 is truthy in Ruby, so a plain
      # `(a.min <=> b.min) || (a.max <=> b.max)` never reaches the secondary
      # key. Test for zero explicitly; a nil (incomparable) primary result
      # still falls through to the secondary key, and an incomparable
      # secondary result falls back to the primary one.
      ordered = ranges.sort do |a, b|
        by_min = a.min <=> b.min
        if by_min.nil? || by_min.zero?
          by_max = a.max <=> b.max
          by_max.nil? ? by_min : by_max
        else
          by_min
        end
      end

      @root = RangeTree.split(ordered)
    end

    attr_reader :root

    # Recursively turn a sorted array of ranges into a balanced subtree.
    #
    # @param [Array<#min, #max>] ranges Sorted ranges
    # @return [Node, nil] The subtree root, or nil for an empty array
    def self.split(ranges)
      return nil if ranges.empty?

      middle = ranges.length / 2

      left  = split(ranges.slice(0, middle))    # Handle middle == 0 correctly.
      range = ranges[middle]                    # Current range.
      right = split(ranges[(middle + 1)..-1])   # Handle middle == ranges.length correctly.

      # Children (when present) already carry their subtree bounds, so the
      # bounds for this node come from the children plus the range itself.
      members = [left, range, right].compact

      Node.new(left, range, right,
               members.map(&:min).min, # Subtree's min.
               members.map(&:max).max) # Subtree's max.
    end

    # Find the stored ranges that intersect the query.
    #
    # @param [Range, Object] range A Range, or a single value that is
    #   treated as the one-point range (value..value)
    # @param [Numeric] limit Stop once this many results have been collected
    # @return [Array] Matching ranges, in tree (sorted) order
    def search(range, limit: Float::INFINITY)
      query = range.is_a?(Range) ? range : (range..range)

      result = []
      RangeTree.search_helper(query, @root, result, limit)

      result
    end

    # In-order traversal that appends matches to +result+, pruning any
    # subtree whose recorded bounds show it cannot overlap the query.
    #
    # @param [#min, #max] q The query range
    # @param [Node, nil] root Root of the subtree being searched
    # @param [Array] result Accumulator for matches (modified in place)
    # @param [Numeric] limit Maximum number of results to collect
    # @return [void]
    def self.search_helper(q, root, result, limit)
      return if root.nil?

      # Visit the left child unless everything in it ends before the query starts.
      left = root.left
      if left && left.max && q.min && !(left.max < q.min)
        search_helper(q, left, result, limit)
      end

      # The limit must be checked here rather than at the top: the left
      # subtree may have just filled the result, and this node must not then
      # add one more.
      return if result.length >= limit

      # Add this node's own range?
      result << root.range if RangeTree.ranges_intersect?(q, root.range)

      # Visit the right child unless everything in it starts after the query ends.
      right = root.right
      if right && q.max && right.min && !(q.max < right.min)
        search_helper(q, right, result, limit)
      end
    end

    # @param [#min, #max] a
    # @param [#min, #max] b
    # @return [Boolean] true when the two closed ranges overlap; false when
    #   either one has a nil endpoint
    def self.ranges_intersect?(a, b)
      return false unless a.min && a.max && b.min && b.max

      a.min <= b.max && a.max >= b.min
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

# Both oga and minitest emit warnings when loaded that we don't want to
# hear about, so silence warnings just while requiring them and then put
# the previous setting back.
saved_verbosity = $VERBOSE
$VERBOSE = nil
%w[oga minitest minitest/spec minitest/autorun].each { |lib| require lib }
$VERBOSE = saved_verbosity

require 'high_level_browse'
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
TESTDIR = File.expand_path(File.dirname(__FILE__))
|
5
|
+
|
6
|
+
# Smoke tests: the library can be required and exposes a version constant.
describe "loads" do
  it "loads" do
    assert true
  end

  it "has a version" do
    # Use a plain assertion: calling expectations such as `wont_be_nil`
    # directly on an object is deprecated in Minitest.
    refute_nil HighLevelBrowse::VERSION
  end
end
|
15
|
+
|
16
|
+
# Regression test against a stored set of call-number-to-category mappings.
describe "Works the same as before" do
  it "gets the same output for 30k randomly chosen call numbers" do
    h = HighLevelBrowse.fetch_and_save(dir: TESTDIR)

    # File.read closes the file, unlike passing an open File to JSON.load.
    old_mappings = JSON.load(File.read(File.join(TESTDIR, '30k_random_old_mappings.json')))

    old_mappings.each do |rec|
      cn = rec['cn'].strip
      newcats = h[cn]
      next if rec['jar'].empty?

      # The call number sits on both sides only so it shows up in a failure
      # message. Use the same (stripped) value on both sides so stray
      # whitespace in the fixture cannot fail the test by itself.
      assert_equal [cn, rec['jar'].sort], [cn, newcats.sort]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: high_level_browse
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Bill Dueber
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-06-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: oga
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: lcsort
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: minitest
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description:
|
84
|
+
email:
|
85
|
+
- bill@dueber.com
|
86
|
+
executables:
|
87
|
+
- fetch_new_hlb
|
88
|
+
- hlb
|
89
|
+
- test_marc_file_for_hlb
|
90
|
+
extensions: []
|
91
|
+
extra_rdoc_files: []
|
92
|
+
files:
|
93
|
+
- ".gitignore"
|
94
|
+
- ".travis.yml"
|
95
|
+
- Gemfile
|
96
|
+
- LICENSE.txt
|
97
|
+
- README.md
|
98
|
+
- Rakefile
|
99
|
+
- bench/bench.rb
|
100
|
+
- bench/hlb.json.gz
|
101
|
+
- bin/fetch_new_hlb
|
102
|
+
- bin/hlb
|
103
|
+
- bin/test_marc_file_for_hlb
|
104
|
+
- high_level_browse.gemspec
|
105
|
+
- lib/high_level_browse.rb
|
106
|
+
- lib/high_level_browse/call_number_range.rb
|
107
|
+
- lib/high_level_browse/db.rb
|
108
|
+
- lib/high_level_browse/range_tree.rb
|
109
|
+
- lib/high_level_browse/version.rb
|
110
|
+
- test/minitest_helper.rb
|
111
|
+
- test/test_high_level_browse.rb
|
112
|
+
homepage: ''
|
113
|
+
licenses:
|
114
|
+
- MIT
|
115
|
+
metadata: {}
|
116
|
+
post_install_message:
|
117
|
+
rdoc_options: []
|
118
|
+
require_paths:
|
119
|
+
- lib
|
120
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
requirements: []
|
131
|
+
rubyforge_project:
|
132
|
+
rubygems_version: 2.6.8
|
133
|
+
signing_key:
|
134
|
+
specification_version: 4
|
135
|
+
summary: Map LC call numbers to academic categories.
|
136
|
+
test_files:
|
137
|
+
- test/minitest_helper.rb
|
138
|
+
- test/test_high_level_browse.rb
|