high_level_browse 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +198 -0
- data/Rakefile +9 -0
- data/bench/bench.rb +57 -0
- data/bench/hlb.json.gz +0 -0
- data/bin/fetch_new_hlb +62 -0
- data/bin/hlb +46 -0
- data/bin/test_marc_file_for_hlb +122 -0
- data/high_level_browse.gemspec +26 -0
- data/lib/high_level_browse.rb +41 -0
- data/lib/high_level_browse/call_number_range.rb +154 -0
- data/lib/high_level_browse/db.rb +150 -0
- data/lib/high_level_browse/range_tree.rb +90 -0
- data/lib/high_level_browse/version.rb +3 -0
- data/test/minitest_helper.rb +14 -0
- data/test/test_high_level_browse.rb +27 -0
- metadata +138 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 286d0ce64d0d9e8dffa58b716f111d086310654d
|
4
|
+
data.tar.gz: 2a13aad07ee29e47b0bcc00f4ba16740491e9bfd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9960852abc0686da303da11c8ead326df0ea7e7df89432962f7d1353e62350afbc7a3ad556d1beecfe6cce816c1bf654ce4bdee78bb195caefdb08caeb67b7cf
|
7
|
+
data.tar.gz: 3d29b51feb0bd70c37eea28248eff4f3dccd8a38cdb23617be8998fdfa821e392743d0f87c25e9f233ee326b4097072ca63d3d04bcdf5c21216ec43a96ecae04
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
*.bundle
|
19
|
+
*.so
|
20
|
+
*.o
|
21
|
+
*.a
|
22
|
+
mkmf.log
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bill Dueber
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# HighLevelBrowse
|
2
|
+
|
3
|
+
Given an LC Call Number, try to get a set of academic disciplines associated with it
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
|
9
|
+
require 'high_level_browse'
|
10
|
+
|
11
|
+
# Pull a new version of the raw data from the UM website,
|
12
|
+
# transform it into something that can be quickly searched,
|
13
|
+
# and serialize it to `hlb.json.gz` in the specified directory
|
14
|
+
hlb = HighLevelBrowse.fetch_and_save(dir: '/tmp')
|
15
|
+
|
16
|
+
# ...or just grab an already fetch_and_saved copy
|
17
|
+
hlb = HighLevelBrowse.load(dir: '/tmp')
|
18
|
+
|
19
|
+
# What HLB categories is an LC Call Number in?
|
20
|
+
hlb.topics 'hc 9112.2'
|
21
|
+
# => [["Social Sciences", "Economics"],
|
22
|
+
# ["Social Sciences", "Social Sciences (General)"]]
|
23
|
+
|
24
|
+
# ... or use the #[] shortcut syntax
|
25
|
+
|
26
|
+
hlb['NC1766 .U52 D733 2014']
|
27
|
+
# => [["Arts", "Art History"],
|
28
|
+
# ["Arts", "Art and Design"],
|
29
|
+
# ["Arts", "Film and Video Studies"]]
|
30
|
+
|
31
|
+
# You can also send more than one call number at a time
|
32
|
+
|
33
|
+
hlb.topics('E 99 .S2 Y67 1993', 'PS 3565 .R5734 F67 2015')
|
34
|
+
# => [["Humanities", "American Culture"],
|
35
|
+
# ["Humanities", "United States History"],
|
36
|
+
# ["Social Sciences", "Native American Studies"],
|
37
|
+
# ["Social Sciences", "Archaeology"],
|
38
|
+
# ["Humanities", "English Language and Literature"]]
|
39
|
+
|
40
|
+
```
|
41
|
+
|
42
|
+
|
43
|
+
## Overview
|
44
|
+
|
45
|
+
While we in the library world sometimes use LC Call Numbers (or at least
|
46
|
+
the initial letters) as a proxy for subject matter, the mapping is iffy
|
47
|
+
in many cases and is, in any case, one-dimensional. Many works simply
|
48
|
+
cover multiple subjects or are relevant to sometimes quite different
|
49
|
+
types of academics.
|
50
|
+
|
51
|
+
Take, for example, the chemistry of the brain as it applies to mental
|
52
|
+
illness. We have a book, _Endorphins : new waves in brain chemistry_
|
53
|
+
cataloged as **QP552.E53 D381 1984**. The QP's map to "Physiology", which
|
54
|
+
is correct but not complete.
|
55
|
+
|
56
|
+
The University of Michigan Library has for years maintained
|
57
|
+
the [High Level Browse](https://www.lib.umich.edu/browse/categories/) (HLB),
|
58
|
+
a mapping of call-number ranges to academic subjects. The entire
|
59
|
+
data set is available as a [1.8MB XML file](https://www.lib.umich.edu/browse/categories/xml.php)
|
60
|
+
for download.
|
61
|
+
|
62
|
+
In the HLB, the call number for _Endorphins : new waves in brain chemistry_ maps
|
63
|
+
to the following categories:
|
64
|
+
|
65
|
+
* Science | Physiology
|
66
|
+
* Health Sciences | Physiology
|
67
|
+
* Health Sciences | Public Health (General)
|
68
|
+
* Science | Chemical Engineering
|
69
|
+
* Engineering | Chemical Engineering
|
70
|
+
* Health Sciences | Biological Chemistry
|
71
|
+
* Science | Chemistry | Biological Chemistry
|
72
|
+
|
73
|
+
This opens up potentially more accurate categorization of works for, say,
|
74
|
+
faceting in a library catalog.
|
75
|
+
|
76
|
+
This gem gives a relatively time-efficient way to get the set of disciplines associated
|
77
|
+
with the given callnumber or callnumbers as part of indexing MARC records into Solr.
|
78
|
+
This mapping is used in many places in the University Library at the University of
|
79
|
+
Michigan, including the
|
80
|
+
[Mirlyn Catalog](https://mirlyn.lib.umich.edu/)
|
81
|
+
(exposed as "Academic Discipline" in the facets) and ejournals/databases (and even
|
82
|
+
Librarians!) via the [Browse page](https://www.lib.umich.edu/browse).
|
83
|
+
|
84
|
+
This categorization may be useful for clustering/faceting
|
85
|
+
in similar applications at other institutions. Note that the actual creation and
|
86
|
+
maintenance of the call number ranges is done by subject specialist librarians and
|
87
|
+
is out of scope for this gem.
|
88
|
+
|
89
|
+
## Command line utilities: `fetch_new_hlb` and `hlb`
|
90
|
+
|
91
|
+
There are also a couple command line applications for managing and querying the
|
92
|
+
data.
|
93
|
+
|
94
|
+
* **fetch_new_hlb** tries to grab a new copy of the data from the umich website
|
95
|
+
and serialize it to a ~500k file called `hlb.json.gz` in the given directory.
|
96
|
+
Useful for putting in a cron job to periodically update with fresh data
|
97
|
+
|
98
|
+
```bash
|
99
|
+
|
100
|
+
$> fetch_new_hlb
|
101
|
+
|
102
|
+
fetch_new_hlb -- get a new copy of the HLB ready for use by high_level_browse
|
103
|
+
and stick it in the given directory
|
104
|
+
|
105
|
+
Usage: fetch_new_hlb <dir>
|
106
|
+
```
|
107
|
+
|
108
|
+
* **hlb** takes one or more callnumbers and returns a text display of the categories
|
109
|
+
associated with them. It will stash a copy of the database in `Dir.tmpdir` if there
|
110
|
+
isn't one there already, and use it on subsequent calls so things aren't so
|
111
|
+
desperately slow. (To find your tmpdir, in your shell
|
112
|
+
run `ruby -e 'require "tmpdir"; puts Dir.tmpdir'`)
|
113
|
+
|
114
|
+
|
115
|
+
```bash
|
116
|
+
$> hlb
|
117
|
+
|
118
|
+
hlb -- get high level browse data for an LC call number
|
119
|
+
|
120
|
+
Example:
|
121
|
+
hlb "qa 11.33 .C4 .H3"
|
122
|
+
or do several at once
|
123
|
+
hlb "PN 33.4" "AC 1122.3 .C22" ...
|
124
|
+
|
125
|
+
# Let's try it
|
126
|
+
$> hlb "qa 11.33 .C4"
|
127
|
+
|
128
|
+
Science | Mathematics
|
129
|
+
Social Sciences | Education
|
130
|
+
|
131
|
+
```
|
132
|
+
|
133
|
+
|
134
|
+
## A warning about (lack of) coverage
|
135
|
+
|
136
|
+
Note that not every possible valid callnumber will necessarily be contained in any
|
137
|
+
discipline at all. Many books aren't academic in nature, and even then
|
138
|
+
coverage is known to have some holes. Some of the ranges cover essentially a
|
139
|
+
single book in the umich collection. And, of course, not every record is going
|
140
|
+
to have a LC Call Number, so there's that.
|
141
|
+
|
142
|
+
This is all to say: this may or may not be useful at your institution. You'll
|
143
|
+
have to experiment.
|
144
|
+
|
145
|
+
To help with this, there's a little script in the `bin/` directory called
|
146
|
+
`test_marc_file_for_hlb` which will, when given a MARC-XML file (ending in `.xml`)
|
147
|
+
or a MARC-binary file (ending in anything else), output some statistics on
|
148
|
+
what kind of coverage you would get. It might be useful to send a test file
|
149
|
+
through there to see what comes up. It looks in the `050` and the `852[h]` to
|
150
|
+
see if anything pops, but you can make it look elsewhere pretty easily.
|
151
|
+
|
152
|
+
It produces something like this:
|
153
|
+
|
154
|
+
```
|
155
|
+
050 fields
|
156
|
+
9790 total
|
157
|
+
209 not recognized as LC call numbers
|
158
|
+
9337 with at least one HLB category
|
159
|
+
244 with NO category
|
160
|
+
|
161
|
+
Of 17642 records,
|
162
|
+
9677 (54.85%) had a field that often contains an LC Call Number
|
163
|
+
9262 (95.71%) of *those* had at least one HLB category
|
164
|
+
|
165
|
+
```
|
166
|
+
|
167
|
+
## Performance
|
168
|
+
|
169
|
+
On my laptop under normal load (e.g., not very scientific at all)
|
170
|
+
I get the following running in a single thread
|
171
|
+
|
172
|
+
```
|
173
|
+
ruby 2.3 this gem ~8500 lookups/second
|
174
|
+
ruby 2.4 this gem ~9100 lookups/second
|
175
|
+
jruby 9 this gem ~20,000 lookups/second
|
176
|
+
jruby 9, old HLB.jar ~6500 lookups/second
|
177
|
+
jruby 1.7 this gem error, can't do named arguments since it's 1.9 mode
|
178
|
+
jruby 1.7 old HLB.jar ~6700 lookups/second
|
179
|
+
```
|
180
|
+
|
181
|
+
The [old HLB.jar](https://github.com/billdueber/HLB-Java) refers to a pure java version that I call from within
|
182
|
+
JRuby as part of my catalog indexing process now. It has a different (worse) algorithm, but is of
|
183
|
+
interest because it's what I'm writing this to replace.
|
184
|
+
|
185
|
+
## Installation
|
186
|
+
|
187
|
+
```bash
|
188
|
+
gem install high_level_browse
|
189
|
+
```
|
190
|
+
|
191
|
+
|
192
|
+
## Contributing
|
193
|
+
|
194
|
+
1. Fork it ( https://github.com/[my-github-username]/high_level_browse/fork )
|
195
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
196
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
197
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
198
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bench/bench.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'benchmark/ips'
|
2
|
+
$:.unshift '../lib'
|
3
|
+
$:.unshift '.'
|
4
|
+
|
5
|
+
|
6
|
+
# On my laptop under normal load (e.g., not very scientific at all)
|
7
|
+
# I get the following running in a single thread
|
8
|
+
# ruby 2.3 ~8500 lookups/second
|
9
|
+
# ruby 2.4 ~9100 lookups/second
|
10
|
+
# jruby 9 ~20k lookups/second
|
11
|
+
# jruby 9, old HLB.jar ~6500 lookups/second
|
12
|
+
# jruby 1.7 error, can't do named arguments
|
13
|
+
# jruby 1.7, old HLB.jar ~6700 lookups/second
|
14
|
+
#
|
15
|
+
# The old HLB.jar has a different (worse) algorithm, but is of
|
16
|
+
# interest because it's what I'm writing this to replace.
|
17
|
+
|
18
|
+
# umich_traject holds .jar files with the old java implementation; see
|
19
|
+
# https://github.com/hathitrust/ht_traject/tree/9e8d414fd9bb2c79e243d289c4d39c05d2de27e5/lib/umich_traject
|
20
|
+
#
|
21
|
+
|
22
|
+
# Only exercise the legacy Java implementation when running under JRuby AND
# the directory holding its jars is present.
#
# `&&` (with a parenthesized `defined?`) is required here: with the
# low-precedence `and`, the assignment binds first, so the constant would
# capture only the `defined?` result and the directory check would be ignored.
TEST_OLD_STUFF = defined?(JRUBY_VERSION) && Dir.exist?('./umich_traject')
|
23
|
+
if TEST_OLD_STUFF
|
24
|
+
puts "Loading old HLB3.jar stuff"
|
25
|
+
require 'umich_traject/jackson-core-asl-1.4.3.jar'
|
26
|
+
require 'umich_traject/jackson-mapper-asl-1.4.3.jar'
|
27
|
+
require 'umich_traject/apache-solr-umichnormalizers.jar'
|
28
|
+
require 'umich_traject/HLB3.jar'
|
29
|
+
java_import Java::edu.umich.lib.hlb::HLB
|
30
|
+
puts "Initializing HLB"
|
31
|
+
HLB.initialize()
|
32
|
+
end
|
33
|
+
|
34
|
+
require 'high_level_browse'
|
35
|
+
|
36
|
+
h = HighLevelBrowse.load(dir: '.')
|
37
|
+
|
38
|
+
cns = File.read('call_numbers.txt').split(/\n/).cycle
|
39
|
+
|
40
|
+
puts RUBY_DESCRIPTION
|
41
|
+
|
42
|
+
total = 0
|
43
|
+
Benchmark.ips do |x|
|
44
|
+
x.config(:time => 25, :warmup => 25)
|
45
|
+
|
46
|
+
x.report("HLB lookups") do
|
47
|
+
total += h[cns.next].count
|
48
|
+
end
|
49
|
+
|
50
|
+
if TEST_OLD_STUFF
|
51
|
+
total = 0
|
52
|
+
x.report("Old java lookups") do
|
53
|
+
total += HLB.categories(cns.next).to_a.count
|
54
|
+
end
|
55
|
+
x.compare!
|
56
|
+
end
|
57
|
+
end
|
data/bench/hlb.json.gz
ADDED
Binary file
|
data/bin/fetch_new_hlb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# If we're loading from source instead of a gem, rubygems
|
4
|
+
# isn't setting load paths for us, so we need to set it ourselves
|
5
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
6
|
+
unless $LOAD_PATH.include? self_load_path
|
7
|
+
$LOAD_PATH << self_load_path
|
8
|
+
end
|
9
|
+
|
10
|
+
# Run the given block with Ruby warnings disabled.
#
# $VERBOSE is restored in an `ensure` clause so that an exception raised by
# the block (e.g. a LoadError from `require`) cannot leave warnings switched
# off for the rest of the process.
#
# @yield the code to run while warnings are suppressed
# @return [Object] whatever the block returns
def silence_warnings(&block)
  warn_level = $VERBOSE
  $VERBOSE = nil
  block.call
ensure
  $VERBOSE = warn_level
end
|
17
|
+
|
18
|
+
# minitest has a circular require warning, which
|
19
|
+
# drives me crazy. Suppress it.
|
20
|
+
silence_warnings do
|
21
|
+
require 'high_level_browse'
|
22
|
+
end
|
23
|
+
|
24
|
+
require 'fileutils'
|
25
|
+
|
26
|
+
# Print an error message to standard output, framed above and below by a
# dashed rule and followed by one blank line.
#
# @param msg [String] the text shown after the "ERROR:" label
def putsmsg(msg)
  rule = "---------------------------------------------------"
  puts rule, " ERROR: #{msg}", rule
  puts
end
|
32
|
+
|
33
|
+
# Print the help text for fetch_new_hlb and terminate the script.
#
# This is only reached on a bad invocation (wrong number of arguments or an
# unusable directory), so the process exits with a non-zero status. That lets
# cron jobs and shell scripts detect the failure, and matches the `hlb`
# script, whose usage also exits with status 1.
#
# @param msg [String, nil] optional error message, shown in a banner first
# @return [void] never returns; always exits
def usage(msg = nil)
  puts
  putsmsg(msg) if msg
  puts "fetch_new_hlb -- get a new copy of the HLB ready for use by high_level_browse"
  puts "and stick it in the given directory"
  puts
  puts " Usage: fetch_new_hlb <dir>"
  puts
  exit(1)
end
|
43
|
+
|
44
|
+
# Exactly one argument (the target directory) is required.
usage unless ARGV.size == 1

dir = ARGV.shift

# Validate the target directory before doing any network work.
usage "#{dir} does not exist" unless File.exist?(dir)
usage "#{dir} is not a directory" unless Dir.exist?(dir)
usage "#{dir} is not writable" unless File.writable?(dir)

begin
  db = HighLevelBrowse.fetch
  db.save(dir: dir)
rescue => e
  puts "============================="
  puts "ERROR FETCHING HLB SOURCE"
  puts " #{e}"
  puts "============================="
  # Report the failure through the exit status as well, so an unattended run
  # (e.g. from cron) does not look successful when no new file was written.
  exit(1)
end
|
data/bin/hlb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Hmmm. How to pass along the location? Stick one in /tmp
|
4
|
+
# and see if it exists?
|
5
|
+
|
6
|
+
|
7
|
+
# Show a short help message with example invocations on standard output,
# then terminate the script with exit status 1.
def usage
  [
    "hlb -- get high level browse data for an LC call number",
    "",
    "Example:\n hlb \"qa 11.33 .C4 .H3\"",
    " or do several at once",
    " hlb \"PN 33.4\" \"AC 1122.3 .C22\" ... ",
    ""
  ].each { |line| puts line }
  exit(1)
end
|
16
|
+
|
17
|
+
usage if ARGV.empty?
|
18
|
+
|
19
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
20
|
+
unless $LOAD_PATH.include? self_load_path
|
21
|
+
$LOAD_PATH << self_load_path
|
22
|
+
end
|
23
|
+
|
24
|
+
require 'high_level_browse'
|
25
|
+
require 'fileutils'
|
26
|
+
require 'tmpdir'
|
27
|
+
|
28
|
+
filename = HighLevelBrowse::DB::FILENAME
|
29
|
+
dir = Dir.tmpdir()
|
30
|
+
fullpath = File.join(dir, filename)
|
31
|
+
|
32
|
+
hlb = if File.exist?(fullpath)
|
33
|
+
HighLevelBrowse.load(dir: dir)
|
34
|
+
else
|
35
|
+
STDERR.puts "Fetching raw data from UMich; wait a sec"
|
36
|
+
HighLevelBrowse.fetch_and_save(dir: dir)
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
topics = hlb[*ARGV]
|
41
|
+
|
42
|
+
if topics.empty?
|
43
|
+
puts "\nNo categories found for #{ARGV}\n\n"
|
44
|
+
else
|
45
|
+
puts "\n" + topics.map { |x| x.join(' | ') }.join("\n") + "\n\n"
|
46
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
4
|
+
unless $LOAD_PATH.include? self_load_path
|
5
|
+
$LOAD_PATH << self_load_path
|
6
|
+
end
|
7
|
+
|
8
|
+
require 'marc'
|
9
|
+
require 'high_level_browse'
|
10
|
+
require 'lcsort'
|
11
|
+
require 'tmpdir'
|
12
|
+
|
13
|
+
|
14
|
+
filename = ARGV[0]
|
15
|
+
|
16
|
+
reader = if filename =~ /xml\Z/i
|
17
|
+
MARC::XMLReader.new(filename)
|
18
|
+
else
|
19
|
+
MARC::Reader.new(filename)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Tally of how the call numbers found in one MARC field fared against the HLB.
#
# Members: count (all call numbers seen), invalid (Lcsort could not normalize
# them), found (at least one HLB category), notfound (no category), and hlb
# (the database object queried with #[]).
Counter = Struct.new(:count, :invalid, :found, :notfound, :hlb) do

  # Classify one call number and bump the matching tally.
  # @param cn [String] a raw call number
  # @return [Integer] 1 if the call number had an HLB category, otherwise 0
  def update(cn)
    self.count += 1
    outcome = check_cn(cn)
    # outcome is one of :invalid / :found / :notfound, each of which is
    # also the name of the struct member that counts it
    self[outcome] += 1
    outcome == :found ? 1 : 0
  end

  # Decide which bucket a call number belongs in.
  # @param cn [String] a raw call number
  # @return [Symbol] :invalid, :found or :notfound
  def check_cn(cn)
    return :invalid if Lcsort.normalize(cn).nil?
    hlb[cn].empty? ? :notfound : :found
  end

  # Print the four tallies, one per line, right-aligned in nine columns.
  def puts_pretty_output
    {
      '%9d total' => count,
      '%9d not recognized as LC call numbers' => invalid,
      '%9d with at least one HLB category' => found,
      '%9d with NO category' => notfound
    }.each { |template, value| puts template % value }
  end
end
|
58
|
+
|
59
|
+
|
60
|
+
# Print the per-field statistics: a heading for each field followed by that
# field's tallies.
# @param f050 [#puts_pretty_output] tallies for the 050 field
# @param f852 [#puts_pretty_output] tallies for the 852$h subfield
def puts_output(f050, f852)
  { "050 fields" => f050, "\n852h fields" => f852 }.each do |heading, counter|
    puts heading
    counter.puts_pretty_output
  end
end
|
66
|
+
|
67
|
+
puts "Fetching/parsing HLB XML file"
|
68
|
+
filename = HighLevelBrowse::DB::FILENAME
|
69
|
+
dir = Dir.tmpdir()
|
70
|
+
fullpath = File.join(dir, filename)
|
71
|
+
|
72
|
+
hlb = if File.exist?(fullpath)
|
73
|
+
puts "Using file at #{fullpath}"
|
74
|
+
HighLevelBrowse.load(dir: dir)
|
75
|
+
else
|
76
|
+
HighLevelBrowse.fetch_and_save(dir: dir)
|
77
|
+
end
|
78
|
+
|
79
|
+
f050 = Counter.new(0, 0, 0, 0, hlb)
|
80
|
+
f852 = Counter.new(0, 0, 0, 0, hlb)
|
81
|
+
records = 0
|
82
|
+
matched_records = 0
|
83
|
+
possible_records = 0
|
84
|
+
puts "Beginning analysis of marc records with 2k record progress reports"
|
85
|
+
# Walk every MARC record, feeding each candidate call number to the matching
# Counter and tracking, per record, whether it had any candidate at all
# (possible) and whether any candidate mapped to an HLB category (found).
reader.each do |r|
  records += 1
  found = 0
  possible = false
  puts '%8d records processed so far' % records if records % 2_000 == 0

  # 050: glue all subfield values of each field into one candidate call number
  r.fields('050').each do |field|
    found += f050.update(field.map(&:value).join(''))
    possible = true
  end

  # 852$h: examined for EVERY record. Previously this scan sat inside the
  # `if r['050']` branch, so records carrying only an 852$h were never counted.
  # A non-destructive map/compact replaces `keep_if`, which edits the array it
  # is called on in place.
  r.fields('852').map { |field| field['h'] }.compact.each do |cn|
    found += f852.update(cn)
    possible = true
  end

  matched_records += 1 if found > 0
  possible_records += 1 if possible
end
|
105
|
+
|
106
|
+
puts "\n\n"
|
107
|
+
puts_output(f050, f852)
|
108
|
+
puts format(
|
109
|
+
%Q[\nOf %d records,
|
110
|
+
%d (%4.2f%%) had a field that often contains an LC Call Number
|
111
|
+
%d (%4.2f%%) of *those* had at least one HLB category],
|
112
|
+
records,
|
113
|
+
possible_records,
|
114
|
+
possible_records.to_f / records * 100,
|
115
|
+
matched_records,
|
116
|
+
matched_records.to_f / possible_records * 100)
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'high_level_browse/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "high_level_browse"
|
8
|
+
spec.version = HighLevelBrowse::VERSION
|
9
|
+
spec.authors = ["Bill Dueber"]
|
10
|
+
spec.email = ["bill@dueber.com"]
|
11
|
+
spec.summary = %q{Map LC call numbers to academic categories.}
|
12
|
+
spec.homepage = ""
|
13
|
+
spec.license = "MIT"
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0")
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ["lib"]
|
19
|
+
|
20
|
+
spec.add_dependency 'oga', '~> 2.1'
|
21
|
+
spec.add_dependency 'lcsort'
|
22
|
+
|
23
|
+
spec.add_development_dependency "bundler", "~> 1.6"
|
24
|
+
spec.add_development_dependency "rake"
|
25
|
+
spec.add_development_dependency "minitest"
|
26
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "high_level_browse/version"
|
2
|
+
require 'high_level_browse/db'
|
3
|
+
require 'uri'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
module HighLevelBrowse

  # Where the raw HLB XML lives; override with the HLB_XML_ENDPOINT
  # environment variable.
  SOURCE_URL = ENV.fetch('HLB_XML_ENDPOINT', 'https://www.lib.umich.edu/browse/categories/xml.php')

  # Download the raw XML from SOURCE_URL and build a database from it.
  # Any StandardError raised along the way is re-raised as a RuntimeError
  # that names the URL.
  # @return [DB] The loaded database
  def self.fetch
    # OpenURI::OpenRead is mixed into the parsed URI by hand so that #read is
    # available (the original author found it missing for https URIs)
    endpoint = URI.parse(SOURCE_URL).extend(OpenURI::OpenRead)
    DB.new_from_xml(endpoint.read)
  rescue => e
    raise "Could not fetch xml from '#{SOURCE_URL}': #{e}"
  end


  # Fetch a fresh copy and write it into the given directory
  # @param [String] dir The directory where the hlb.json.gz file will end up
  # @return [DB] The fetched and saved database
  def self.fetch_and_save(dir:)
    fetch.tap { |db| db.save(dir: dir) }
  end


  # Read a previously saved database back from disk
  # @param [String] dir The directory where the hlb.json.gz file is located
  # @return [DB] The loaded database
  def self.load(dir:)
    DB.load(dir: dir)
  end

end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
require 'lcsort'
|
2
|
+
require 'high_level_browse/range_tree'
|
3
|
+
|
4
|
+
|
5
|
+
# An efficient set of CallNumberRanges from which to get topics
|
6
|
+
class HighLevelBrowse::CallNumberRangeSet < HighLevelBrowse::RangeTree


  # Returns the array of topic arrays for the given LC string
  # @param [String] raw_lc A raw LC string (eg., 'qa 112.3 .A4 1990')
  # @return [Array<Array<String>>] Arrays of topic labels; empty when the
  #   string cannot be normalized as an LC call number
  def topics_for(raw_lc)
    normalized = Lcsort.normalize(HighLevelBrowse::CallNumberRange.preprocess(raw_lc))
    # Lcsort.normalize yields nil for input it cannot parse. Such a string
    # cannot fall inside any range, so answer with "no topics" rather than
    # handing nil to the tree search.
    # NOTE(review): RangeTree#search is not visible here; confirm it has no
    # special handling of nil that callers rely on.
    return [] if normalized.nil?
    search(normalized).map(&:topic_array).uniq
  end
end
|
17
|
+
|
18
|
+
|
19
|
+
# A callnumber-range keeps track of the original begin/end
|
20
|
+
# strings as well as the normalized versions, and can be
|
21
|
+
# serialized to JSON
|
22
|
+
|
23
|
+
class HighLevelBrowse::CallNumberRange
  include Comparable

  attr_reader :min, :max, :min_raw, :max_raw, :firstletter

  attr_accessor :topic_array, :redundant

  # Captures whatever sits between leading and trailing runs of
  # whitespace/punctuation
  SPACE_OR_PUNCT = /\A[\s\p{Punct}]*(.*?)[\s\p{Punct}]*\Z/
  # A digit immediately followed by a letter (case-insensitive)
  DIGIT_TO_LETTER = /(\d)([A-Z])/i

  # @nodoc
  # Remove spaces/punctuation from the ends of the string
  def self.strip_spaces_and_punct(str)
    str.gsub(SPACE_OR_PUNCT, '\1')
  end

  # @nodoc
  # Force a space between any digit->letter transition
  def self.force_break_between_digit_and_letter(str)
    str.gsub(DIGIT_TO_LETTER, '\1 \2')
  end

  # @nodoc
  # Preprocess the string, removing spaces/punctuation off the ends
  # and forcing a space where there's a digit->letter transition.
  # A nil argument is treated as the empty string.
  def self.preprocess(str)
    str ||= ''
    force_break_between_digit_and_letter(
      strip_spaces_and_punct(str)
    )
  end


  # @param [String] min Raw call number that starts the range
  # @param [String] max Raw call number that ends the range
  # @param [Array<String>] topic_array All levels of the topic for this range
  def initialize(min:, max:, topic_array:)
    @illegal = false
    @redundant = false
    self.min = self.class.preprocess(min)
    self.max = self.class.preprocess(max)
    @topic_array = topic_array
    # An illegal range may have no normalized min to take a letter from
    @firstletter = self.min[0] unless @illegal
  end


  # Compare based on min, then max
  # @param [CallNumberRange] o the range to compare to
  # @return [Integer, nil] -1/0/1, or nil when `o` is not range-like
  def <=>(o)
    return nil unless o.respond_to?(:min) && o.respond_to?(:max)
    [self.min, self.max] <=> [o.min, o.max]
  end

  # @return [String] the raw endpoints, e.g. "[QA 1 - QA 99]"
  def to_s
    "[#{self.min_raw} - #{self.max_raw}]"
  end

  # Restore state from the array written by #to_json. Used by .json_create,
  # which bypasses #initialize.
  def reconstitute(min, max, min_raw, max_raw, firstletter, topic_array)
    @min = min
    @max = max
    @min_raw = min_raw
    @max_raw = max_raw
    @firstletter = firstletter
    @topic_array = topic_array
    # #initialize is skipped on this path, so set the flags explicitly;
    # otherwise #illegal? and #redundant would report nil instead of false.
    @illegal = false
    @redundant = false
  end


  # Two ranges are equal if their min, max, and topic array
  # are all the same
  # @param [CallNumberRange] other the range to compare to
  # @return [Boolean] false (rather than an exception) for objects that
  #   do not look like a range
  def ==(other)
    return false unless %i[min max topic_array].all? { |m| other.respond_to?(m) }
    @min == other.min &&
      @max == other.max &&
      @topic_array == other.topic_array
  end


  # @nodoc
  # JSON roundtrip
  def to_json(*a)
    {
      'json_class' => self.class.name,
      'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
    }.to_json(*a)
  end

  # @nodoc
  def self.json_create(h)
    cnr = self.allocate
    cnr.reconstitute(*(h['data']))
    cnr
  end


  # In both min= and max=, a value that Lcsort cannot normalize
  # (normalize returns nil) simply sets the @illegal flag so it can be
  # checked later on via #illegal?.
  def min=(x)
    @min_raw = x
    possible_min = Lcsort.normalize(x)
    if possible_min.nil? # didn't normalize
      @illegal = true
      nil
    else
      @min = possible_min
    end
  end

  # Same as min=. Set the illegal flag if the value doesn't normalize
  def max=(x)
    @max_raw = x
    possible_max = Lcsort.normalize(x)
    if possible_max.nil? # didn't normalize
      @illegal = true
      nil
    else
      @max = possible_max + '~' # add a tilde to make it a true endpoint
    end
  end

  # @return [Boolean] true if either endpoint failed to normalize
  def illegal?
    @illegal
  end


  # @param [CallNumberRange] other
  # @return [Boolean] whether this range fully encloses `other`
  def surrounds(other)
    @min <= other.min && @max >= other.max
  end

  # @param [String] x a normalized call number
  # @return [Boolean] whether x falls within this range (endpoints included)
  def contains(x)
    @min <= x && @max >= x
  end

  alias_method :cover?, :contains
  alias_method :member?, :contains

end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'oga'
|
2
|
+
require 'high_level_browse/call_number_range'
|
3
|
+
require 'zlib'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
class HighLevelBrowse::DB
|
7
|
+
|
8
|
+
# Hard-code filename. If you need more than one, put them
|
9
|
+
# in different directories
|
10
|
+
FILENAME = 'hlb.json.gz'
|
11
|
+
|
12
|
+
# Given a bunch of CallNumberRange objects, create a new
|
13
|
+
# database with an efficient structure for querying
|
14
|
+
# @param [Array<HighLevelBrowse::CallNumberRange>] array_of_ranges
|
15
|
+
def initialize(array_of_ranges)
|
16
|
+
@all = array_of_ranges
|
17
|
+
@ranges = self.create_letter_indexed_ranges(@all)
|
18
|
+
end
|
19
|
+
|
20
|
+
# Given an array of ranges, create efficient
|
21
|
+
# search structures
|
22
|
+
# @private
|
23
|
+
def create_letter_indexed_ranges(all)
|
24
|
+
bins = {}
|
25
|
+
('A'..'Z').each do |letter|
|
26
|
+
cnrs = all.find_all {|x| x.firstletter == letter}
|
27
|
+
bins[letter] = HighLevelBrowse::CallNumberRangeSet.new(cnrs)
|
28
|
+
end
|
29
|
+
bins
|
30
|
+
end
|
31
|
+
|
32
|
+
# Get the topic arrays associated with this callnumber
|
33
|
+
# of the form:
|
34
|
+
# [
|
35
|
+
# [toplevel, secondlevel],
|
36
|
+
# [toplevel, secondlevel, thirdlevel],
|
37
|
+
# ...
|
38
|
+
# ]
|
39
|
+
# @param [String] raw_callnumber_string
|
40
|
+
# @return [Array<Array>] A (possibly empty) array of arrays of topics
|
41
|
+
def topics(*raw_callnumber_strings)
|
42
|
+
raw_callnumber_strings.reduce([]) do |acc, raw_callnumber_string|
|
43
|
+
firstletter = raw_callnumber_string.strip.upcase[0]
|
44
|
+
if @ranges.has_key? firstletter
|
45
|
+
acc + @ranges[firstletter].topics_for(raw_callnumber_string)
|
46
|
+
else
|
47
|
+
acc
|
48
|
+
end
|
49
|
+
end.uniq
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
alias_method :[], :topics
|
54
|
+
|
55
|
+
# Create a new object from a string with the XML
|
56
|
+
# in it.
|
57
|
+
# @param [String] xml The contents of the HLB XML dump
|
58
|
+
# (e.g., from 'https://www.lib.umich.edu/browse/categories/xml.php')
|
59
|
+
# @return [DB]
|
60
|
+
def self.new_from_xml(xml)
|
61
|
+
oga_doc_root = Oga.parse_xml(xml)
|
62
|
+
simple_array_of_cnrs = cnrs_within_oga_node(node: oga_doc_root)
|
63
|
+
self.new(simple_array_of_cnrs).freeze
|
64
|
+
end
|
65
|
+
|
66
|
+
|
67
|
+
# Save to disk
|
68
|
+
# @param [String] dir The directory where the hlb.json.gz file will be saved
|
69
|
+
# @return [DB] The loaded database
|
70
|
+
def save(dir:)
|
71
|
+
Zlib::GzipWriter.open(File.join(dir, FILENAME)) do |out|
|
72
|
+
out.puts JSON.fast_generate(@all)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
# Load from disk
|
78
|
+
# @param [String] dir The directory where the hlb.json.gz file is located
|
79
|
+
# @return [DB] The loaded database
|
80
|
+
def self.load(dir:)
|
81
|
+
simple_array_of_cnrs = Zlib::GzipReader.open(File.join(dir, FILENAME)) do |infile|
|
82
|
+
JSON.load(infile.read).to_a
|
83
|
+
end
|
84
|
+
db = self.new(simple_array_of_cnrs)
|
85
|
+
db.freeze
|
86
|
+
db
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
# Freeze everything
|
91
|
+
# @return [DB] the frozen db
|
92
|
+
def freeze
|
93
|
+
@ranges.freeze
|
94
|
+
@all.freeze
|
95
|
+
self
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
# Recurse through the parsed XML document, at each stage keeping track of
#   * where we are (which xpaths lead to the remaining levels of children)
#   * what the current topics are ([level1, level2])
# For every child matched at the current level, collect the call numbers
# associated with that child's topic and then, recursively, everything found
# beneath it, and send it all back as one flat array.
# NOTE(review): the bare `private` that precedes this method does not apply to
# `def self.` methods, so it presumably remains publicly callable -- confirm
# whether that is intended.
# @param [Oga::Node] node A node of the parsed HLB XML file
# @param [Array<String>] decendent_xpaths A list of xpaths to the descendants of this node
# @param [Array<String>] topic_array An array with all levels of the topics associated with this node
# @return [Array<HighLevelBrowse::CallNumberRange>]
def self.cnrs_within_oga_node(node:, decendent_xpaths: ['/hlb/subject', 'topic', 'sub-topic'], topic_array: [])
  # Base case -- there is no deeper level left to descend into.
  return [] if decendent_xpaths.empty?

  child_xpath, *deeper_xpaths = decendent_xpaths

  # Work on a copy so the caller's topic list is never mutated; the document
  # root contributes no topic of its own.
  topics_here = topic_array.dup
  topics_here.push node.get(:name) unless node == node.root_node

  found = []
  node.xpath(child_xpath).each do |child|
    # First the ranges attached directly to this child, then its descendants'.
    found.concat(call_numbers_list_from_leaves(node: child, topic_array: topics_here))
    found.concat(cnrs_within_oga_node(node: child, decendent_xpaths: deeper_xpaths, topic_array: topics_here))
  end
  found
end
|
127
|
+
|
128
|
+
|
129
|
+
# Given a second-to-lowest-level node, append its name to a copy of the topic
# list and build one call number range from each of its `call-numbers`
# children (using their :start and :end attributes).
# Ranges that report themselves as illegal are left out; nothing is logged.
# @param [Oga::Node] node The node whose `call-numbers` children are read
# @param [Array<String>] topic_array Topics of the enclosing levels (not modified)
# @return [Array<HighLevelBrowse::CallNumberRange>] the legal ranges, in the order xpath yields them
def self.call_numbers_list_from_leaves(node:, topic_array:)
  legal_ranges = []
  full_topic = topic_array.dup.push node.get(:name)
  node.xpath('call-numbers').each do |cn_node|
    candidate = HighLevelBrowse::CallNumberRange.new(
      min: cn_node.get(:start),
      max: cn_node.get(:end),
      topic_array: full_topic
    )
    # Illegal ranges are skipped silently.
    legal_ranges.push candidate unless candidate.illegal?
  end
  legal_ranges
end
|
148
|
+
|
149
|
+
|
150
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# Never released as a gem, as near as I can tell.
|
2
|
+
# Taken from https://github.com/clearhaus/range-tree,
|
3
|
+
# which was released under the MIT license
|
4
|
+
# by ClearHaus (https://www.clearhaus.com/)
|
5
|
+
|
6
|
+
# Namespaced to avoid conflicts with other range_tree
|
7
|
+
# gems
|
8
|
+
|
9
|
+
module HighLevelBrowse
  # A static, balanced binary tree over range-like objects (anything that
  # responds to #min and #max). Every node also records the smallest #min and
  # largest #max found anywhere in its subtree, which lets a search skip whole
  # subtrees that cannot intersect the query.
  class RangeTree
    # One node of the tree: the range stored at this position, its two
    # children, and the min/max bounds of the whole subtree rooted here.
    class Node
      attr_reader :left, :range, :right, :min, :max

      # @param [Node, nil] left Left child
      # @param [#min, #max] range The range stored in this node
      # @param [Node, nil] right Right child
      # @param [Object, nil] min Subtree minimum; falls back to range.min when nil
      # @param [Object, nil] max Subtree maximum; falls back to range.max when nil
      def initialize(left, range, right, min, max)
        @left = left
        @range = range
        @right = right
        @min = min || range.min
        @max = max || range.max
      end
    end

    attr_reader :root

    # Build the tree. The input array is never modified.
    # @param [Array<#min, #max>] ranges The ranges to index
    # @param [Boolean] sorted Pass true when +ranges+ is already ordered by
    #   [min, max]; the sort is then skipped
    def initialize(ranges, sorted: false)
      # It's only required to be sorted by `r.min`, but if many ranges have the
      # same left endpoint, then it's more efficient if also secondarily sorted
      # by the right endpoint (or equivalently by the length).
      ordered = sorted ? ranges : ranges.sort { |a, b| compare_endpoints(a, b) }
      @root = RangeTree.split(ordered)
    end

    # Recursively turn an ordered array of ranges into a balanced subtree.
    # @param [Array<#min, #max>] ranges
    # @return [Node, nil] the subtree root, or nil for an empty array
    def self.split(ranges)
      return nil if ranges.empty?

      middle = ranges.length / 2

      left = split(ranges.slice(0, middle))      # Handles middle == 0 correctly.
      range = ranges[middle]                     # Current range.
      right = split(ranges[(middle + 1)..-1])    # Handles middle == ranges.length - 1 correctly.

      # Nodes and ranges both respond to #min/#max, so they can be mixed here.
      members = [left, range, right].compact

      Node.new(left, range, right,
               members.map(&:min).min, # Subtree's min.
               members.map(&:max).max) # Subtree's max.
    end

    # Find stored ranges that intersect the query.
    # @param [Range, Object] range A Range, or a single value (treated as value..value)
    # @param [Numeric] limit Maximum number of results to collect
    # @return [Array] matching ranges, in tree (in-order) order
    def search(range, limit: Float::INFINITY)
      range = range.is_a?(Range) ? range : (range..range)

      result = []
      RangeTree.search_helper(range, @root, result, limit)

      result
    end

    # In-order walk that appends intersecting ranges to +result+, skipping
    # subtrees whose bounds show they cannot intersect +q+.
    # @param [#min, #max] q The query range
    # @param [Node, nil] root Subtree to search
    # @param [Array] result Accumulator, appended to in place
    # @param [Numeric] limit Maximum size of +result+
    # @return [void]
    def self.search_helper(q, root, result, limit)
      return if root.nil?

      # Visit the left child unless everything in it ends before the query starts.
      left = root.left
      if left && left.max && q.min && !(left.max < q.min)
        search_helper(q, left, result, limit)
      end

      # Yes, it needs to be checked here rather than at the top. Otherwise, at
      # the point of checking, too many would not yet have been added, but after
      # the left child has been searched we might hit the limit and then "this"
      # node would add one more as well.
      return if result.length >= limit

      # Add root?
      result << root.range if RangeTree.ranges_intersect?(q, root.range)

      # Nothing further can be added once the limit is reached, so skip the walk.
      return if result.length >= limit

      # Visit the right child unless everything in it starts after the query ends.
      right = root.right
      if right && q.max && right.min && !(q.max < right.min)
        search_helper(q, right, result, limit)
      end
    end

    # @return [Boolean] true when the two closed ranges share at least one
    #   point; false when any endpoint is nil
    def self.ranges_intersect?(a, b)
      return false unless a.min && a.max && b.min && b.max

      a.min <= b.max && a.max >= b.min
    end

    private

    # Sort comparator: order by #min, breaking ties by #max. A plain
    # `(min <=> min) || (max <=> max)` never reaches the tie-break because 0 is
    # truthy in Ruby, so the equal-minimum case is handled explicitly.
    def compare_endpoints(a, b)
      by_min = a.min <=> b.min
      return by_min unless by_min.nil? || by_min.zero?

      by_max = a.max <=> b.max
      # Incomparable minimums: defer entirely to the maximums. Equal minimums
      # with incomparable maximums: treat the pair as equal.
      by_min.nil? ? by_max : (by_max || 0)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

# Both oga and minitest emit warnings while loading that we don't want to
# hear about: switch $VERBOSE off just for these requires, then restore
# whatever it was set to before.
previous_verbosity = $VERBOSE
$VERBOSE = nil
%w[oga minitest minitest/spec minitest/autorun].each { |lib| require lib }
$VERBOSE = previous_verbosity

require 'high_level_browse'
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'minitest_helper'

require 'json'
TESTDIR = File.expand_path(File.dirname(__FILE__))

# Smoke tests: the library can be required and exposes a version constant.
describe "loads" do
  it "loads" do
    assert true
  end

  it "has a version" do
    HighLevelBrowse::VERSION.wont_be_nil
  end
end

# Regression test against the category mappings recorded in the
# 30k_random_old_mappings.json fixture.
describe "Works the same as before" do
  it "gets the same output for 30k randomly chosen call numbers" do
    h = HighLevelBrowse.fetch_and_save(dir: TESTDIR)
    fixture_path = File.join(TESTDIR, '30k_random_old_mappings.json')
    # Block form so the fixture's file handle is closed once parsing is done.
    old_mappings = File.open(fixture_path) { |f| JSON.load(f) }
    old_mappings.each do |rec|
      cn = rec['cn'].strip
      newcats = h[cn]
      # Records with an empty 'jar' list are not compared.
      next if rec['jar'].empty?
      # NOTE(review): the expected side uses the stripped cn while the actual
      # side uses the raw rec['cn'], so a fixture value with surrounding
      # whitespace would fail here regardless of the categories -- confirm
      # that this is intended.
      assert_equal [cn, rec['jar'].sort], [rec['cn'], newcats.sort]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: high_level_browse
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Bill Dueber
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-06-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: oga
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: lcsort
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: minitest
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description:
|
84
|
+
email:
|
85
|
+
- bill@dueber.com
|
86
|
+
executables:
|
87
|
+
- fetch_new_hlb
|
88
|
+
- hlb
|
89
|
+
- test_marc_file_for_hlb
|
90
|
+
extensions: []
|
91
|
+
extra_rdoc_files: []
|
92
|
+
files:
|
93
|
+
- ".gitignore"
|
94
|
+
- ".travis.yml"
|
95
|
+
- Gemfile
|
96
|
+
- LICENSE.txt
|
97
|
+
- README.md
|
98
|
+
- Rakefile
|
99
|
+
- bench/bench.rb
|
100
|
+
- bench/hlb.json.gz
|
101
|
+
- bin/fetch_new_hlb
|
102
|
+
- bin/hlb
|
103
|
+
- bin/test_marc_file_for_hlb
|
104
|
+
- high_level_browse.gemspec
|
105
|
+
- lib/high_level_browse.rb
|
106
|
+
- lib/high_level_browse/call_number_range.rb
|
107
|
+
- lib/high_level_browse/db.rb
|
108
|
+
- lib/high_level_browse/range_tree.rb
|
109
|
+
- lib/high_level_browse/version.rb
|
110
|
+
- test/minitest_helper.rb
|
111
|
+
- test/test_high_level_browse.rb
|
112
|
+
homepage: ''
|
113
|
+
licenses:
|
114
|
+
- MIT
|
115
|
+
metadata: {}
|
116
|
+
post_install_message:
|
117
|
+
rdoc_options: []
|
118
|
+
require_paths:
|
119
|
+
- lib
|
120
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
requirements: []
|
131
|
+
rubyforge_project:
|
132
|
+
rubygems_version: 2.6.8
|
133
|
+
signing_key:
|
134
|
+
specification_version: 4
|
135
|
+
summary: Map LC call numbers to academic categories.
|
136
|
+
test_files:
|
137
|
+
- test/minitest_helper.rb
|
138
|
+
- test/test_high_level_browse.rb
|