high_level_browse 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG.md +13 -0
- data/bin/hlb +1 -1
- data/bin/test_marc_file_for_hlb +1 -2
- data/high_level_browse.gemspec +2 -3
- data/lib/high_level_browse/call_number_range.rb +74 -37
- data/lib/high_level_browse/db.rb +31 -38
- data/lib/high_level_browse/range_tree.rb +5 -2
- data/lib/high_level_browse/version.rb +1 -1
- data/test/minitest_helper.rb +0 -1
- data/test/test_high_level_browse.rb +0 -12
- metadata +13 -29
- data/bench/bench.rb +0 -57
- data/bench/hlb.json.gz +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5eae24ef8906dfb25f1f949b4b29b69a33834dc63a7b0d96cd356324b14a79df
|
4
|
+
data.tar.gz: 919434516a5098b1c4f9766434c68b7b7e6960128a0eb59abc7e442da520de5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aa8cb1cb8472c788c1def8efff185a8fc16957ef97450178687941f6dc5a03444f6d85b1c25f601aa34eef02ed51e8ab7475c9d067e2db35aa94118d319a7dc
|
7
|
+
data.tar.gz: 390752555d91aec5060be2a34097b9cc6ca664b22c44ae6b29a989e9cb766bc45b83599310d2fec2ee79fe3155678c41fc52c24f6a3658692ab7b2ac32bb513d
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# High Level Browse
|
2
|
+
|
3
|
+
## 1.0.0
|
4
|
+
|
5
|
+
* New normalization algorithm.
|
6
|
+
* Because the normalization algorithm has changed, the on-disk file
|
7
|
+
(`hlb.json.gz`) needs to be re-generated; the old normalization
|
8
|
+
form and the new one won't match each other.
|
9
|
+
|
10
|
+
## 0.2.0
|
11
|
+
|
12
|
+
* First real release
|
13
|
+
|
data/bin/hlb
CHANGED
data/bin/test_marc_file_for_hlb
CHANGED
@@ -7,7 +7,6 @@ end
|
|
7
7
|
|
8
8
|
require 'marc'
|
9
9
|
require 'high_level_browse'
|
10
|
-
require 'lcsort'
|
11
10
|
require 'tmpdir'
|
12
11
|
|
13
12
|
|
@@ -38,7 +37,7 @@ Counter = Struct.new(:count, :invalid, :found, :notfound, :hlb) do
|
|
38
37
|
|
39
38
|
|
40
39
|
def check_cn(cn)
|
41
|
-
normalized =
|
40
|
+
normalized = HighLevelBrowse::CallNumberRange.callnumber_normalize(cn)
|
42
41
|
return :invalid if normalized.nil?
|
43
42
|
cats = hlb[cn]
|
44
43
|
if cats.empty?
|
data/high_level_browse.gemspec
CHANGED
@@ -17,10 +17,9 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
|
-
spec.add_dependency '
|
21
|
-
spec.add_dependency 'lcsort'
|
20
|
+
spec.add_dependency 'nokogiri', '~>1.0'
|
22
21
|
|
23
|
-
spec.add_development_dependency "bundler",
|
22
|
+
spec.add_development_dependency "bundler", '~>2.0'
|
24
23
|
spec.add_development_dependency "rake"
|
25
24
|
spec.add_development_dependency "minitest"
|
26
25
|
end
|
@@ -1,21 +1,25 @@
|
|
1
|
-
require 'lcsort'
|
2
1
|
require 'high_level_browse/range_tree'
|
3
2
|
|
4
|
-
|
5
3
|
# An efficient set of CallNumberRanges from which to get topics
|
6
4
|
class HighLevelBrowse::CallNumberRangeSet < HighLevelBrowse::RangeTree
|
7
5
|
|
6
|
+
ANY_DIGIT = /\d/.freeze
|
7
|
+
|
8
|
+
def has_digits(str)
|
9
|
+
ANY_DIGIT.match?(str)
|
10
|
+
end
|
8
11
|
|
9
12
|
# Returns the array of topic arrays for the given LC string
|
10
13
|
# @param [String] raw_lc A raw LC string (eg., 'qa 112.3 .A4 1990')
|
11
14
|
# @return [Array<Array<String>>] Arrays of topic labels
|
12
15
|
def topics_for(raw_lc)
|
13
|
-
normalized =
|
16
|
+
normalized = ::HighLevelBrowse::CallNumberRange.callnumber_normalize(raw_lc)
|
14
17
|
self.search(normalized).map(&:topic_array).uniq
|
18
|
+
rescue => e
|
19
|
+
require 'pry'; binding.pry
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
18
|
-
|
19
23
|
# A callnumber-range keeps track of the original begin/end
|
20
24
|
# strings as well as the normalized versions, and can be
|
21
25
|
# serialized to JSON
|
@@ -25,7 +29,6 @@ class HighLevelBrowse::CallNumberRange
|
|
25
29
|
|
26
30
|
attr_reader :min, :max, :min_raw, :max_raw, :firstletter
|
27
31
|
|
28
|
-
|
29
32
|
attr_accessor :topic_array, :redundant
|
30
33
|
|
31
34
|
SPACE_OR_PUNCT = /\A[\s\p{Punct}]*(.*?)[\s\p{Punct}]*\Z/
|
@@ -42,27 +45,71 @@ class HighLevelBrowse::CallNumberRange
|
|
42
45
|
def self.force_break_between_digit_and_letter(str)
|
43
46
|
str.gsub(DIGIT_TO_LETTER, '\1 \2')
|
44
47
|
end
|
48
|
+
|
45
49
|
# @nodoc
|
46
50
|
# Preprocess the string, removing spaces/punctuation off the end
|
47
51
|
# and forcing a space where there's a digit->letter transition
|
48
|
-
def self.preprocess(str)
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
52
|
+
# def self.preprocess(str)
|
53
|
+
# str ||= ''
|
54
|
+
# force_break_between_digit_and_letter(strip_spaces_and_punct(str)
|
55
|
+
# )
|
56
|
+
# end
|
57
|
+
|
58
|
+
# Normalize the callnumber in a slightly more sane way
|
59
|
+
# @param [String] cs_str The raw callnumber to normalize
|
60
|
+
CN = /\A\s*(?<letters>\p{L}{1,3})\s*(?<digits>\d{1,5}(?!\d))(?:\.(?<decimals>\d+))?(?<rest>.*)\Z/.freeze
|
61
|
+
|
62
|
+
def self.callnumber_normalize(cs_str)
|
63
|
+
return nil if cs_str.nil?
|
64
|
+
|
65
|
+
cs_str = cs_str.downcase
|
66
|
+
return cs_str if /\A\s*\p{L}{1,3}+\s*\Z/.match? cs_str # just letters
|
67
|
+
|
68
|
+
m = CN.match(cs_str)
|
69
|
+
return nil unless m
|
70
|
+
|
71
|
+
digits = m[:digits].size.to_s + m[:digits]
|
72
|
+
decimals = m[:decimals] ? "." + m[:decimals] : ""
|
73
|
+
rest = cleanup_freetext(m[:rest])
|
74
|
+
clean = m[:letters] + digits + decimals + " " + rest
|
75
|
+
clean.strip.gsub(/\s+/, ' ')
|
53
76
|
end
|
54
77
|
|
78
|
+
# @param [String] str String to clean up
|
79
|
+
def self.cleanup_freetext(str)
|
80
|
+
return "" if str.nil?
|
81
|
+
|
82
|
+
s = str.strip
|
83
|
+
return s if s == ""
|
84
|
+
|
85
|
+
s = replace_dot_before_letter_with_space(s)
|
86
|
+
s = remove_dots_between_letters(s)
|
87
|
+
s = force_space_between_digit_and_letter(s)
|
88
|
+
s.strip.gsub(/\s+/, ' ')
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.replace_dot_before_letter_with_space(s)
|
92
|
+
s.gsub /\.(\p{L})/, '\\1'
|
93
|
+
end
|
94
|
+
|
95
|
+
# @param [String] str
|
96
|
+
def self.remove_dots_between_letters(str)
|
97
|
+
str.gsub(/(\p{L})\.(\p{L})/, '\\1\\2')
|
98
|
+
end
|
99
|
+
|
100
|
+
def self.force_space_between_digit_and_letter(s)
|
101
|
+
s.gsub(/(\d)(\p{L})/, '\\1 \\2')
|
102
|
+
end
|
55
103
|
|
56
104
|
def initialize(min:, max:, topic_array:)
|
57
|
-
@illegal
|
58
|
-
@redundant
|
59
|
-
self.min
|
60
|
-
self.max
|
105
|
+
@illegal = false
|
106
|
+
@redundant = false
|
107
|
+
self.min = min
|
108
|
+
self.max = max
|
61
109
|
@topic_array = topic_array
|
62
110
|
@firstletter = self.min[0] unless @illegal
|
63
111
|
end
|
64
112
|
|
65
|
-
|
66
113
|
# Compare based on @min, then end
|
67
114
|
# @param [CallNumberRange] o the range to compare to
|
68
115
|
def <=>(o)
|
@@ -74,31 +121,25 @@ class HighLevelBrowse::CallNumberRange
|
|
74
121
|
end
|
75
122
|
|
76
123
|
def reconstitute(min, max, min_raw, max_raw, firstletter, topic_array)
|
77
|
-
@min
|
78
|
-
@max
|
79
|
-
@min_raw
|
80
|
-
@max_raw
|
124
|
+
@min = min
|
125
|
+
@max = max
|
126
|
+
@min_raw = min_raw
|
127
|
+
@max_raw = max_raw
|
81
128
|
@firstletter = firstletter
|
82
129
|
@topic_array = topic_array
|
83
130
|
end
|
84
131
|
|
85
|
-
|
86
132
|
# Two ranges are equal if their @min, @max, and topic array
|
87
133
|
# are all the same
|
88
134
|
# @param [CallNumberRange] other the range to compare to
|
89
135
|
def ==(other)
|
90
|
-
@min == other.min and
|
91
|
-
@max == other.max and
|
92
|
-
@topic_array == other.topic_array
|
136
|
+
@min == other.min and @max == other.max and @topic_array == other.topic_array
|
93
137
|
end
|
94
138
|
|
95
|
-
|
96
139
|
# @nodoc
|
97
140
|
# JSON roundtrip
|
98
141
|
def to_json(*a)
|
99
|
-
{
|
100
|
-
'json_class' => self.class.name,
|
101
|
-
'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
|
142
|
+
{'json_class' => self.class.name, 'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
|
102
143
|
}.to_json(*a)
|
103
144
|
end
|
104
145
|
|
@@ -109,29 +150,26 @@ class HighLevelBrowse::CallNumberRange
|
|
109
150
|
cnr
|
110
151
|
end
|
111
152
|
|
112
|
-
|
113
153
|
# In both min= and max=, we also rescue any parsing errors
|
114
154
|
# and simply set the @illegal flag so we can use it later on.
|
115
155
|
def min=(x)
|
116
|
-
@min_raw
|
117
|
-
possible_min =
|
156
|
+
@min_raw = x
|
157
|
+
possible_min = self.class.callnumber_normalize(x)
|
118
158
|
if possible_min.nil? # didn't normalize
|
119
159
|
@illegal = true
|
120
160
|
nil
|
121
|
-
else
|
122
|
-
@min = possible_min
|
161
|
+
else @min = possible_min
|
123
162
|
end
|
124
163
|
end
|
125
164
|
|
126
165
|
# Same as min=. Set the illegal flag if we get an error
|
127
166
|
def max=(x)
|
128
|
-
@max_raw
|
129
|
-
possible_max =
|
167
|
+
@max_raw = x
|
168
|
+
possible_max = self.class.callnumber_normalize(x)
|
130
169
|
if possible_max.nil? # didn't normalize
|
131
170
|
@illegal = true
|
132
171
|
nil
|
133
|
-
else
|
134
|
-
@max = possible_max + '~' # add a tilde to make it a true endpoint
|
172
|
+
else @max = possible_max + '~' # add a tilde to make it a true endpoint
|
135
173
|
end
|
136
174
|
end
|
137
175
|
|
@@ -139,7 +177,6 @@ class HighLevelBrowse::CallNumberRange
|
|
139
177
|
@illegal
|
140
178
|
end
|
141
179
|
|
142
|
-
|
143
180
|
def surrounds(other)
|
144
181
|
@min <= other.min and @max >= other.max
|
145
182
|
end
|
data/lib/high_level_browse/db.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'nokogiri'
|
2
2
|
require 'high_level_browse/call_number_range'
|
3
3
|
require 'zlib'
|
4
4
|
require 'json'
|
@@ -13,7 +13,7 @@ class HighLevelBrowse::DB
|
|
13
13
|
# database with an efficient structure for querying
|
14
14
|
# @param [Array<HighLevelBrowse::CallNumberRange>] array_of_ranges
|
15
15
|
def initialize(array_of_ranges)
|
16
|
-
@all
|
16
|
+
@all = array_of_ranges
|
17
17
|
@ranges = self.create_letter_indexed_ranges(@all)
|
18
18
|
end
|
19
19
|
|
@@ -22,8 +22,8 @@ class HighLevelBrowse::DB
|
|
22
22
|
# @private
|
23
23
|
def create_letter_indexed_ranges(all)
|
24
24
|
bins = {}
|
25
|
-
('
|
26
|
-
cnrs
|
25
|
+
('a'..'z').each do |letter|
|
26
|
+
cnrs = all.find_all { |x| x.firstletter == letter }
|
27
27
|
bins[letter] = HighLevelBrowse::CallNumberRangeSet.new(cnrs)
|
28
28
|
end
|
29
29
|
bins
|
@@ -40,16 +40,17 @@ class HighLevelBrowse::DB
|
|
40
40
|
# @return [Array<Array>] A (possibly empty) array of arrays of topics
|
41
41
|
def topics(*raw_callnumber_strings)
|
42
42
|
raw_callnumber_strings.reduce([]) do |acc, raw_callnumber_string|
|
43
|
-
firstletter = raw_callnumber_string.
|
43
|
+
firstletter = if raw_callnumber_string.nil?
|
44
|
+
nil
|
45
|
+
else raw_callnumber_string.to_s.strip.downcase[0]
|
46
|
+
end
|
44
47
|
if @ranges.has_key? firstletter
|
45
48
|
acc + @ranges[firstletter].topics_for(raw_callnumber_string)
|
46
|
-
else
|
47
|
-
acc
|
49
|
+
else acc
|
48
50
|
end
|
49
51
|
end.uniq
|
50
52
|
end
|
51
53
|
|
52
|
-
|
53
54
|
alias_method :[], :topics
|
54
55
|
|
55
56
|
# Create a new object from a string with the XML
|
@@ -58,12 +59,11 @@ class HighLevelBrowse::DB
|
|
58
59
|
# (e.g., from 'https://www.lib.umich.edu/browse/categories/xml.php')
|
59
60
|
# @return [DB]
|
60
61
|
def self.new_from_xml(xml)
|
61
|
-
|
62
|
-
simple_array_of_cnrs =
|
62
|
+
noko_doc_root = Nokogiri::XML(xml)
|
63
|
+
simple_array_of_cnrs = cnrs_within_noko_node(node: noko_doc_root)
|
63
64
|
self.new(simple_array_of_cnrs).freeze
|
64
65
|
end
|
65
66
|
|
66
|
-
|
67
67
|
# Save to disk
|
68
68
|
# @param [String] dir The directory where the hlb.json.gz file will be saved
|
69
69
|
# @return [DB] The loaded database
|
@@ -73,7 +73,6 @@ class HighLevelBrowse::DB
|
|
73
73
|
end
|
74
74
|
end
|
75
75
|
|
76
|
-
|
77
76
|
# Load from disk
|
78
77
|
# @param [String] dir The directory where the hlb.json.gz file is located
|
79
78
|
# @return [DB] The loaded database
|
@@ -81,12 +80,11 @@ class HighLevelBrowse::DB
|
|
81
80
|
simple_array_of_cnrs = Zlib::GzipReader.open(File.join(dir, FILENAME)) do |infile|
|
82
81
|
JSON.load(infile.read).to_a
|
83
82
|
end
|
84
|
-
db
|
83
|
+
db = self.new(simple_array_of_cnrs)
|
85
84
|
db.freeze
|
86
85
|
db
|
87
86
|
end
|
88
87
|
|
89
|
-
|
90
88
|
# Freeze everything
|
91
89
|
# @return [DB] the frozen db
|
92
90
|
def freeze
|
@@ -102,49 +100,44 @@ class HighLevelBrowse::DB
|
|
102
100
|
# * what the current topics are ([level1, level2])
|
103
101
|
# Get all the call numbers associated with the topic represented by the given node,
|
104
102
|
# as well as all the children of the given node, and send it back as a big ol' array
|
105
|
-
# @param [
|
103
|
+
# @param [Nokogiri::XML::Node] node A node of the parsed HLB XML file
|
106
104
|
# @param [Array<String>] decendent_xpaths A list of xpaths to the descendants of this node
|
107
105
|
# @param [Array<String>] topic_array An array with all levels of the topics associated with this node
|
108
106
|
# @return [Array<HighLevelBrowse::CallNumberRange>]
|
109
|
-
def self.
|
107
|
+
def self.cnrs_within_noko_node(node:, decendent_xpaths: ['/hlb/subject', 'topic', 'sub-topic'], topic_array: [])
|
110
108
|
if decendent_xpaths.empty?
|
111
109
|
[] # base case -- we're as low as we're going to go
|
112
|
-
else
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
node
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
cnrs
|
110
|
+
else current_xpath_component = decendent_xpaths[0]
|
111
|
+
new_xpath = decendent_xpaths[1..-1]
|
112
|
+
new_topic = topic_array.dup
|
113
|
+
new_topic.push node[:name] unless node == node.document # skip the root
|
114
|
+
cnrs = []
|
115
|
+
# For each sub-component, get both the call-number-ranges (cnrs) associated
|
116
|
+
# with this level, as well as recursively getting from all the children
|
117
|
+
node.xpath(current_xpath_component).each do |c|
|
118
|
+
cnrs += call_numbers_list_from_leaves(node: c, topic_array: new_topic)
|
119
|
+
cnrs += cnrs_within_noko_node(node: c, decendent_xpaths: new_xpath, topic_array: new_topic)
|
120
|
+
end
|
121
|
+
cnrs
|
125
122
|
end
|
126
123
|
end
|
127
124
|
|
128
|
-
|
129
125
|
# Given a second-to-lowest-level node, get its topic and
|
130
126
|
# extract call number ranges from its children
|
131
127
|
def self.call_numbers_list_from_leaves(node:, topic_array:)
|
132
|
-
cnrs
|
133
|
-
new_topic = topic_array.dup.push node
|
128
|
+
cnrs = []
|
129
|
+
new_topic = topic_array.dup.push node[:name]
|
134
130
|
node.xpath('call-numbers').each do |cn_node|
|
135
|
-
min = cn_node
|
136
|
-
max = cn_node
|
131
|
+
min = cn_node[:start]
|
132
|
+
max = cn_node[:end]
|
137
133
|
|
138
134
|
new_cnr = HighLevelBrowse::CallNumberRange.new(min: min, max: max, topic_array: new_topic)
|
139
135
|
if new_cnr.illegal?
|
140
|
-
# do some sort of logging
|
141
|
-
else
|
142
|
-
cnrs.push new_cnr
|
136
|
+
# do some sort of logging else cnrs.push new_cnr
|
143
137
|
end
|
144
138
|
end
|
145
139
|
cnrs
|
146
140
|
|
147
141
|
end
|
148
142
|
|
149
|
-
|
150
143
|
end
|
@@ -48,8 +48,8 @@ module HighLevelBrowse
|
|
48
48
|
end
|
49
49
|
|
50
50
|
def search(range, limit: Float::INFINITY)
|
51
|
+
return [] if range.nil?
|
51
52
|
range = range.is_a?(Range) ? range : (range..range)
|
52
|
-
|
53
53
|
result = []
|
54
54
|
RangeTree.search_helper(range, @root, result, limit)
|
55
55
|
|
@@ -58,7 +58,6 @@ module HighLevelBrowse
|
|
58
58
|
|
59
59
|
def self.search_helper(q, root, result, limit)
|
60
60
|
return if root.nil?
|
61
|
-
|
62
61
|
# Visit left child?
|
63
62
|
if (l = root.left) and l.max and q.min and \
|
64
63
|
not l.max < q.min # The interesting part.
|
@@ -70,6 +69,10 @@ module HighLevelBrowse
|
|
70
69
|
# point of checking, there wasn't added too many, but after left child has
|
71
70
|
# been checked, we might hit the limit and then, "this" will add one as
|
72
71
|
# well.
|
72
|
+
#
|
73
|
+
# (I'm leaving the above paragraph intact as a reminder to myself to
|
74
|
+
# read things over and make sure they're, you know, actual English. At this point
|
75
|
+
# in 2022 I have no idea what the heck I was saying.)
|
73
76
|
|
74
77
|
# Add root?
|
75
78
|
result << root.range if RangeTree.ranges_intersect?(q, root.range)
|
data/test/minitest_helper.rb
CHANGED
@@ -13,15 +13,3 @@ describe "loads" do
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
describe "Works the same as before" do
|
17
|
-
it "gets the same output for 30k randomly chosen call numbers" do
|
18
|
-
h = HighLevelBrowse.fetch_and_save(dir: TESTDIR)
|
19
|
-
JSON.load(File.open(File.join(TESTDIR, '30k_random_old_mappings.json'))).each do |rec|
|
20
|
-
cn = rec['cn'].strip
|
21
|
-
newcats = h[cn]
|
22
|
-
next if rec['jar'].empty?
|
23
|
-
assert_equal [cn, rec['jar'].sort], [rec['cn'], newcats.sort]
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|
metadata
CHANGED
@@ -1,57 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: high_level_browse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bill Dueber
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '1.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: lcsort
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
26
|
+
version: '1.0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: bundler
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
31
|
- - "~>"
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
33
|
+
version: '2.0'
|
48
34
|
type: :development
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
38
|
- - "~>"
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
40
|
+
version: '2.0'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: rake
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,7 +66,7 @@ dependencies:
|
|
80
66
|
- - ">="
|
81
67
|
- !ruby/object:Gem::Version
|
82
68
|
version: '0'
|
83
|
-
description:
|
69
|
+
description:
|
84
70
|
email:
|
85
71
|
- bill@dueber.com
|
86
72
|
executables:
|
@@ -92,12 +78,11 @@ extra_rdoc_files: []
|
|
92
78
|
files:
|
93
79
|
- ".gitignore"
|
94
80
|
- ".travis.yml"
|
81
|
+
- CHANGELOG.md
|
95
82
|
- Gemfile
|
96
83
|
- LICENSE.txt
|
97
84
|
- README.md
|
98
85
|
- Rakefile
|
99
|
-
- bench/bench.rb
|
100
|
-
- bench/hlb.json.gz
|
101
86
|
- bin/fetch_new_hlb
|
102
87
|
- bin/hlb
|
103
88
|
- bin/test_marc_file_for_hlb
|
@@ -113,7 +98,7 @@ homepage: ''
|
|
113
98
|
licenses:
|
114
99
|
- MIT
|
115
100
|
metadata: {}
|
116
|
-
post_install_message:
|
101
|
+
post_install_message:
|
117
102
|
rdoc_options: []
|
118
103
|
require_paths:
|
119
104
|
- lib
|
@@ -128,9 +113,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
113
|
- !ruby/object:Gem::Version
|
129
114
|
version: '0'
|
130
115
|
requirements: []
|
131
|
-
|
132
|
-
|
133
|
-
signing_key:
|
116
|
+
rubygems_version: 3.3.3
|
117
|
+
signing_key:
|
134
118
|
specification_version: 4
|
135
119
|
summary: Map LC call numbers to academic categories.
|
136
120
|
test_files:
|
data/bench/bench.rb
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
require 'benchmark/ips'
|
2
|
-
$:.unshift '../lib'
|
3
|
-
$:.unshift '.'
|
4
|
-
|
5
|
-
|
6
|
-
# On my laptop under normal load (e.g., not very scientific at all)
|
7
|
-
# I get the following running in a single thread
|
8
|
-
# ruby 2.3 ~8500 lookups/second
|
9
|
-
# ruby 2.4 ~9100 lookups/second
|
10
|
-
# jruby 9 ~20k lookups/second
|
11
|
-
# jruby 9, old HLB.jar ~6500 lookups/second
|
12
|
-
# jruby 1.7 error, can't do named arguments
|
13
|
-
# jruby 1.7, old HLB.jar ~6700 lookups/second
|
14
|
-
#
|
15
|
-
# The old HLB.jar has a different (worse) algorithm, but is of
|
16
|
-
# interest because it's what I'm writing this to replace.
|
17
|
-
|
18
|
-
# umich_traject holds .jar files with the old java implementation; see
|
19
|
-
# https://github.com/hathitrust/ht_traject/tree/9e8d414fd9bb2c79e243d289c4d39c05d2de27e5/lib/umich_traject
|
20
|
-
#
|
21
|
-
|
22
|
-
TEST_OLD_STUFF = defined? JRUBY_VERSION and Dir.exist?('./umich_traject')
|
23
|
-
if TEST_OLD_STUFF
|
24
|
-
puts "Loading old HLB3.jar stuff"
|
25
|
-
require 'umich_traject/jackson-core-asl-1.4.3.jar'
|
26
|
-
require 'umich_traject/jackson-mapper-asl-1.4.3.jar'
|
27
|
-
require 'umich_traject/apache-solr-umichnormalizers.jar'
|
28
|
-
require 'umich_traject/HLB3.jar'
|
29
|
-
java_import Java::edu.umich.lib.hlb::HLB
|
30
|
-
puts "Initializing HLB"
|
31
|
-
HLB.initialize()
|
32
|
-
end
|
33
|
-
|
34
|
-
require 'high_level_browse'
|
35
|
-
|
36
|
-
h = HighLevelBrowse.load(dir: '.')
|
37
|
-
|
38
|
-
cns = File.read('call_numbers.txt').split(/\n/).cycle
|
39
|
-
|
40
|
-
puts RUBY_DESCRIPTION
|
41
|
-
|
42
|
-
total = 0
|
43
|
-
Benchmark.ips do |x|
|
44
|
-
x.config(:time => 25, :warmup => 25)
|
45
|
-
|
46
|
-
x.report("HLB lookups") do
|
47
|
-
total += h[cns.next].count
|
48
|
-
end
|
49
|
-
|
50
|
-
if TEST_OLD_STUFF
|
51
|
-
total = 0
|
52
|
-
x.report("Old java lookups") do
|
53
|
-
total += HLB.categories(cns.next).to_a.count
|
54
|
-
end
|
55
|
-
x.compare!
|
56
|
-
end
|
57
|
-
end
|
data/bench/hlb.json.gz
DELETED
Binary file
|