high_level_browse 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +13 -0
- data/bin/hlb +1 -1
- data/bin/test_marc_file_for_hlb +1 -2
- data/high_level_browse.gemspec +2 -3
- data/lib/high_level_browse/call_number_range.rb +74 -37
- data/lib/high_level_browse/db.rb +31 -38
- data/lib/high_level_browse/range_tree.rb +5 -2
- data/lib/high_level_browse/version.rb +1 -1
- data/test/minitest_helper.rb +0 -1
- data/test/test_high_level_browse.rb +0 -12
- metadata +13 -29
- data/bench/bench.rb +0 -57
- data/bench/hlb.json.gz +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5eae24ef8906dfb25f1f949b4b29b69a33834dc63a7b0d96cd356324b14a79df
|
4
|
+
data.tar.gz: 919434516a5098b1c4f9766434c68b7b7e6960128a0eb59abc7e442da520de5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aa8cb1cb8472c788c1def8efff185a8fc16957ef97450178687941f6dc5a03444f6d85b1c25f601aa34eef02ed51e8ab7475c9d067e2db35aa94118d319a7dc
|
7
|
+
data.tar.gz: 390752555d91aec5060be2a34097b9cc6ca664b22c44ae6b29a989e9cb766bc45b83599310d2fec2ee79fe3155678c41fc52c24f6a3658692ab7b2ac32bb513d
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# High Level Browse
|
2
|
+
|
3
|
+
## 1.0.0
|
4
|
+
|
5
|
+
* New normalization algorithm.
|
6
|
+
* The normalization algorithm has changed, so the on-disk file
|
7
|
+
(`hlb.json.gz`) needs to be re-generated, because the old normalization
|
8
|
+
form and the new one won't match each other.
|
9
|
+
|
10
|
+
## 0.2.0
|
11
|
+
|
12
|
+
* First real release
|
13
|
+
|
data/bin/hlb
CHANGED
data/bin/test_marc_file_for_hlb
CHANGED
@@ -7,7 +7,6 @@ end
|
|
7
7
|
|
8
8
|
require 'marc'
|
9
9
|
require 'high_level_browse'
|
10
|
-
require 'lcsort'
|
11
10
|
require 'tmpdir'
|
12
11
|
|
13
12
|
|
@@ -38,7 +37,7 @@ Counter = Struct.new(:count, :invalid, :found, :notfound, :hlb) do
|
|
38
37
|
|
39
38
|
|
40
39
|
def check_cn(cn)
|
41
|
-
normalized =
|
40
|
+
normalized = HighLevelBrowse::CallNumberRange.callnumber_normalize(cn)
|
42
41
|
return :invalid if normalized.nil?
|
43
42
|
cats = hlb[cn]
|
44
43
|
if cats.empty?
|
data/high_level_browse.gemspec
CHANGED
@@ -17,10 +17,9 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
|
-
spec.add_dependency '
|
21
|
-
spec.add_dependency 'lcsort'
|
20
|
+
spec.add_dependency 'nokogiri', '~>1.0'
|
22
21
|
|
23
|
-
spec.add_development_dependency "bundler",
|
22
|
+
spec.add_development_dependency "bundler", '~>2.0'
|
24
23
|
spec.add_development_dependency "rake"
|
25
24
|
spec.add_development_dependency "minitest"
|
26
25
|
end
|
@@ -1,21 +1,25 @@
|
|
1
|
-
require 'lcsort'
|
2
1
|
require 'high_level_browse/range_tree'
|
3
2
|
|
4
|
-
|
5
3
|
# An efficient set of CallNumberRanges from which to get topics
|
6
4
|
class HighLevelBrowse::CallNumberRangeSet < HighLevelBrowse::RangeTree
|
7
5
|
|
6
|
+
ANY_DIGIT = /\d/.freeze
|
7
|
+
|
8
|
+
def has_digits(str)
|
9
|
+
ANY_DIGIT.match?(str)
|
10
|
+
end
|
8
11
|
|
9
12
|
# Returns the array of topic arrays for the given LC string
|
10
13
|
# @param [String] raw_lc A raw LC string (e.g., 'qa 112.3 .A4 1990')
|
11
14
|
# @return [Array<Array<String>>] Arrays of topic labels
|
12
15
|
def topics_for(raw_lc)
|
13
|
-
normalized =
|
16
|
+
normalized = ::HighLevelBrowse::CallNumberRange.callnumber_normalize(raw_lc)
|
14
17
|
self.search(normalized).map(&:topic_array).uniq
|
18
|
+
rescue => e
|
19
|
+
require 'pry'; binding.pry
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
18
|
-
|
19
23
|
# A callnumber-range keeps track of the original begin/end
|
20
24
|
# strings as well as the normalized versions, and can be
|
21
25
|
# serialized to JSON
|
@@ -25,7 +29,6 @@ class HighLevelBrowse::CallNumberRange
|
|
25
29
|
|
26
30
|
attr_reader :min, :max, :min_raw, :max_raw, :firstletter
|
27
31
|
|
28
|
-
|
29
32
|
attr_accessor :topic_array, :redundant
|
30
33
|
|
31
34
|
SPACE_OR_PUNCT = /\A[\s\p{Punct}]*(.*?)[\s\p{Punct}]*\Z/
|
@@ -42,27 +45,71 @@ class HighLevelBrowse::CallNumberRange
|
|
42
45
|
def self.force_break_between_digit_and_letter(str)
|
43
46
|
str.gsub(DIGIT_TO_LETTER, '\1 \2')
|
44
47
|
end
|
48
|
+
|
45
49
|
# @nodoc
|
46
50
|
# Preprocess the string, removing spaces/punctuation off the end
|
47
51
|
# and forcing a space where there's a digit->letter transition
|
48
|
-
def self.preprocess(str)
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
52
|
+
# def self.preprocess(str)
|
53
|
+
# str ||= ''
|
54
|
+
# force_break_between_digit_and_letter(strip_spaces_and_punct(str)
|
55
|
+
# )
|
56
|
+
# end
|
57
|
+
|
58
|
+
# Normalize the callnumber in a slightly more sane way
|
59
|
+
# @param [String] cs_str The raw callnumber to normalize
|
60
|
+
CN = /\A\s*(?<letters>\p{L}{1,3})\s*(?<digits>\d{1,5}(?!\d))(?:\.(?<decimals>\d+))?(?<rest>.*)\Z/.freeze
|
61
|
+
|
62
|
+
def self.callnumber_normalize(cs_str)
|
63
|
+
return nil if cs_str.nil?
|
64
|
+
|
65
|
+
cs_str = cs_str.downcase
|
66
|
+
return cs_str if /\A\s*\p{L}{1,3}+\s*\Z/.match? cs_str # just letters
|
67
|
+
|
68
|
+
m = CN.match(cs_str)
|
69
|
+
return nil unless m
|
70
|
+
|
71
|
+
digits = m[:digits].size.to_s + m[:digits]
|
72
|
+
decimals = m[:decimals] ? "." + m[:decimals] : ""
|
73
|
+
rest = cleanup_freetext(m[:rest])
|
74
|
+
clean = m[:letters] + digits + decimals + " " + rest
|
75
|
+
clean.strip.gsub(/\s+/, ' ')
|
53
76
|
end
|
54
77
|
|
78
|
+
# @param [String] str String to clean up
|
79
|
+
def self.cleanup_freetext(str)
|
80
|
+
return "" if str.nil?
|
81
|
+
|
82
|
+
s = str.strip
|
83
|
+
return s if s == ""
|
84
|
+
|
85
|
+
s = replace_dot_before_letter_with_space(s)
|
86
|
+
s = remove_dots_between_letters(s)
|
87
|
+
s = force_space_between_digit_and_letter(s)
|
88
|
+
s.strip.gsub(/\s+/, ' ')
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.replace_dot_before_letter_with_space(s)
|
92
|
+
s.gsub /\.(\p{L})/, '\\1'
|
93
|
+
end
|
94
|
+
|
95
|
+
# @param [String] str
|
96
|
+
def self.remove_dots_between_letters(str)
|
97
|
+
str.gsub(/(\p{L})\.(\p{L})/, '\\1\\2')
|
98
|
+
end
|
99
|
+
|
100
|
+
def self.force_space_between_digit_and_letter(s)
|
101
|
+
s.gsub(/(\d)(\p{L})/, '\\1 \\2')
|
102
|
+
end
|
55
103
|
|
56
104
|
def initialize(min:, max:, topic_array:)
|
57
|
-
@illegal
|
58
|
-
@redundant
|
59
|
-
self.min
|
60
|
-
self.max
|
105
|
+
@illegal = false
|
106
|
+
@redundant = false
|
107
|
+
self.min = min
|
108
|
+
self.max = max
|
61
109
|
@topic_array = topic_array
|
62
110
|
@firstletter = self.min[0] unless @illegal
|
63
111
|
end
|
64
112
|
|
65
|
-
|
66
113
|
# Compare based on @min, then end
|
67
114
|
# @param [CallNumberRange] o the range to compare to
|
68
115
|
def <=>(o)
|
@@ -74,31 +121,25 @@ class HighLevelBrowse::CallNumberRange
|
|
74
121
|
end
|
75
122
|
|
76
123
|
def reconstitute(min, max, min_raw, max_raw, firstletter, topic_array)
|
77
|
-
@min
|
78
|
-
@max
|
79
|
-
@min_raw
|
80
|
-
@max_raw
|
124
|
+
@min = min
|
125
|
+
@max = max
|
126
|
+
@min_raw = min_raw
|
127
|
+
@max_raw = max_raw
|
81
128
|
@firstletter = firstletter
|
82
129
|
@topic_array = topic_array
|
83
130
|
end
|
84
131
|
|
85
|
-
|
86
132
|
# Two ranges are equal if their @min, @max, and topic array
|
87
133
|
# are all the same
|
88
134
|
# @param [CallNumberRange] other the range to compare to
|
89
135
|
def ==(other)
|
90
|
-
@min == other.min and
|
91
|
-
@max == other.max and
|
92
|
-
@topic_array == other.topic_array
|
136
|
+
@min == other.min and @max == other.max and @topic_array == other.topic_array
|
93
137
|
end
|
94
138
|
|
95
|
-
|
96
139
|
# @nodoc
|
97
140
|
# JSON roundtrip
|
98
141
|
def to_json(*a)
|
99
|
-
{
|
100
|
-
'json_class' => self.class.name,
|
101
|
-
'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
|
142
|
+
{'json_class' => self.class.name, 'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
|
102
143
|
}.to_json(*a)
|
103
144
|
end
|
104
145
|
|
@@ -109,29 +150,26 @@ class HighLevelBrowse::CallNumberRange
|
|
109
150
|
cnr
|
110
151
|
end
|
111
152
|
|
112
|
-
|
113
153
|
# In both min= and max=, we also catch any call number that fails to normalize
|
114
154
|
# and simply set the @illegal flag so we can use it later on.
|
115
155
|
def min=(x)
|
116
|
-
@min_raw
|
117
|
-
possible_min =
|
156
|
+
@min_raw = x
|
157
|
+
possible_min = self.class.callnumber_normalize(x)
|
118
158
|
if possible_min.nil? # didn't normalize
|
119
159
|
@illegal = true
|
120
160
|
nil
|
121
|
-
else
|
122
|
-
@min = possible_min
|
161
|
+
else @min = possible_min
|
123
162
|
end
|
124
163
|
end
|
125
164
|
|
126
165
|
# Same as min=. Set the illegal flag if the value fails to normalize
|
127
166
|
def max=(x)
|
128
|
-
@max_raw
|
129
|
-
possible_max =
|
167
|
+
@max_raw = x
|
168
|
+
possible_max = self.class.callnumber_normalize(x)
|
130
169
|
if possible_max.nil? # didn't normalize
|
131
170
|
@illegal = true
|
132
171
|
nil
|
133
|
-
else
|
134
|
-
@max = possible_max + '~' # add a tilde to make it a true endpoint
|
172
|
+
else @max = possible_max + '~' # add a tilde to make it a true endpoint
|
135
173
|
end
|
136
174
|
end
|
137
175
|
|
@@ -139,7 +177,6 @@ class HighLevelBrowse::CallNumberRange
|
|
139
177
|
@illegal
|
140
178
|
end
|
141
179
|
|
142
|
-
|
143
180
|
def surrounds(other)
|
144
181
|
@min <= other.min and @max >= other.max
|
145
182
|
end
|
data/lib/high_level_browse/db.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'nokogiri'
|
2
2
|
require 'high_level_browse/call_number_range'
|
3
3
|
require 'zlib'
|
4
4
|
require 'json'
|
@@ -13,7 +13,7 @@ class HighLevelBrowse::DB
|
|
13
13
|
# database with an efficient structure for querying
|
14
14
|
# @param [Array<HighLevelBrowse::CallNumberRange>] array_of_ranges
|
15
15
|
def initialize(array_of_ranges)
|
16
|
-
@all
|
16
|
+
@all = array_of_ranges
|
17
17
|
@ranges = self.create_letter_indexed_ranges(@all)
|
18
18
|
end
|
19
19
|
|
@@ -22,8 +22,8 @@ class HighLevelBrowse::DB
|
|
22
22
|
# @private
|
23
23
|
def create_letter_indexed_ranges(all)
|
24
24
|
bins = {}
|
25
|
-
('
|
26
|
-
cnrs
|
25
|
+
('a'..'z').each do |letter|
|
26
|
+
cnrs = all.find_all { |x| x.firstletter == letter }
|
27
27
|
bins[letter] = HighLevelBrowse::CallNumberRangeSet.new(cnrs)
|
28
28
|
end
|
29
29
|
bins
|
@@ -40,16 +40,17 @@ class HighLevelBrowse::DB
|
|
40
40
|
# @return [Array<Array>] A (possibly empty) array of arrays of topics
|
41
41
|
def topics(*raw_callnumber_strings)
|
42
42
|
raw_callnumber_strings.reduce([]) do |acc, raw_callnumber_string|
|
43
|
-
firstletter = raw_callnumber_string.
|
43
|
+
firstletter = if raw_callnumber_string.nil?
|
44
|
+
nil
|
45
|
+
else raw_callnumber_string.to_s.strip.downcase[0]
|
46
|
+
end
|
44
47
|
if @ranges.has_key? firstletter
|
45
48
|
acc + @ranges[firstletter].topics_for(raw_callnumber_string)
|
46
|
-
else
|
47
|
-
acc
|
49
|
+
else acc
|
48
50
|
end
|
49
51
|
end.uniq
|
50
52
|
end
|
51
53
|
|
52
|
-
|
53
54
|
alias_method :[], :topics
|
54
55
|
|
55
56
|
# Create a new object from a string with the XML
|
@@ -58,12 +59,11 @@ class HighLevelBrowse::DB
|
|
58
59
|
# (e.g., from 'https://www.lib.umich.edu/browse/categories/xml.php')
|
59
60
|
# @return [DB]
|
60
61
|
def self.new_from_xml(xml)
|
61
|
-
|
62
|
-
simple_array_of_cnrs =
|
62
|
+
noko_doc_root = Nokogiri::XML(xml)
|
63
|
+
simple_array_of_cnrs = cnrs_within_noko_node(node: noko_doc_root)
|
63
64
|
self.new(simple_array_of_cnrs).freeze
|
64
65
|
end
|
65
66
|
|
66
|
-
|
67
67
|
# Save to disk
|
68
68
|
# @param [String] dir The directory where the hlb.json.gz file will be saved
|
69
69
|
# @return [DB] The loaded database
|
@@ -73,7 +73,6 @@ class HighLevelBrowse::DB
|
|
73
73
|
end
|
74
74
|
end
|
75
75
|
|
76
|
-
|
77
76
|
# Load from disk
|
78
77
|
# @param [String] dir The directory where the hlb.json.gz file is located
|
79
78
|
# @return [DB] The loaded database
|
@@ -81,12 +80,11 @@ class HighLevelBrowse::DB
|
|
81
80
|
simple_array_of_cnrs = Zlib::GzipReader.open(File.join(dir, FILENAME)) do |infile|
|
82
81
|
JSON.load(infile.read).to_a
|
83
82
|
end
|
84
|
-
db
|
83
|
+
db = self.new(simple_array_of_cnrs)
|
85
84
|
db.freeze
|
86
85
|
db
|
87
86
|
end
|
88
87
|
|
89
|
-
|
90
88
|
# Freeze everything
|
91
89
|
# @return [DB] the frozen db
|
92
90
|
def freeze
|
@@ -102,49 +100,44 @@ class HighLevelBrowse::DB
|
|
102
100
|
# * what the current topics are ([level1, level2])
|
103
101
|
# Get all the call numbers associated with the topic represented by the given node,
|
104
102
|
# as well as all the children of the given node, and send it back as a big ol' array
|
105
|
-
# @param [
|
103
|
+
# @param [Nokogiri::XML::Node] node A node of the parsed HLB XML file
|
106
104
|
# @param [Array<String>] decendent_xpaths A list of xpaths to the descendants of this node
|
107
105
|
# @param [Array<String>] topic_array An array with all levels of the topics associated with this node
|
108
106
|
# @return [Array<HighLevelBrowse::CallNumberRange>]
|
109
|
-
def self.
|
107
|
+
def self.cnrs_within_noko_node(node:, decendent_xpaths: ['/hlb/subject', 'topic', 'sub-topic'], topic_array: [])
|
110
108
|
if decendent_xpaths.empty?
|
111
109
|
[] # base case -- we're as low as we're going to go
|
112
|
-
else
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
node
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
cnrs
|
110
|
+
else current_xpath_component = decendent_xpaths[0]
|
111
|
+
new_xpath = decendent_xpaths[1..-1]
|
112
|
+
new_topic = topic_array.dup
|
113
|
+
new_topic.push node[:name] unless node == node.document # skip the root
|
114
|
+
cnrs = []
|
115
|
+
# For each sub-component, get both the call-number-ranges (cnrs) associated
|
116
|
+
# with this level, as well as recursively getting from all the children
|
117
|
+
node.xpath(current_xpath_component).each do |c|
|
118
|
+
cnrs += call_numbers_list_from_leaves(node: c, topic_array: new_topic)
|
119
|
+
cnrs += cnrs_within_noko_node(node: c, decendent_xpaths: new_xpath, topic_array: new_topic)
|
120
|
+
end
|
121
|
+
cnrs
|
125
122
|
end
|
126
123
|
end
|
127
124
|
|
128
|
-
|
129
125
|
# Given a second-to-lowest-level node, get its topic and
|
130
126
|
# extract call number ranges from its children
|
131
127
|
def self.call_numbers_list_from_leaves(node:, topic_array:)
|
132
|
-
cnrs
|
133
|
-
new_topic = topic_array.dup.push node
|
128
|
+
cnrs = []
|
129
|
+
new_topic = topic_array.dup.push node[:name]
|
134
130
|
node.xpath('call-numbers').each do |cn_node|
|
135
|
-
min = cn_node
|
136
|
-
max = cn_node
|
131
|
+
min = cn_node[:start]
|
132
|
+
max = cn_node[:end]
|
137
133
|
|
138
134
|
new_cnr = HighLevelBrowse::CallNumberRange.new(min: min, max: max, topic_array: new_topic)
|
139
135
|
if new_cnr.illegal?
|
140
|
-
# do some sort of logging
|
141
|
-
else
|
142
|
-
cnrs.push new_cnr
|
136
|
+
# do some sort of logging else cnrs.push new_cnr
|
143
137
|
end
|
144
138
|
end
|
145
139
|
cnrs
|
146
140
|
|
147
141
|
end
|
148
142
|
|
149
|
-
|
150
143
|
end
|
@@ -48,8 +48,8 @@ module HighLevelBrowse
|
|
48
48
|
end
|
49
49
|
|
50
50
|
def search(range, limit: Float::INFINITY)
|
51
|
+
return [] if range.nil?
|
51
52
|
range = range.is_a?(Range) ? range : (range..range)
|
52
|
-
|
53
53
|
result = []
|
54
54
|
RangeTree.search_helper(range, @root, result, limit)
|
55
55
|
|
@@ -58,7 +58,6 @@ module HighLevelBrowse
|
|
58
58
|
|
59
59
|
def self.search_helper(q, root, result, limit)
|
60
60
|
return if root.nil?
|
61
|
-
|
62
61
|
# Visit left child?
|
63
62
|
if (l = root.left) and l.max and q.min and \
|
64
63
|
not l.max < q.min # The interesting part.
|
@@ -70,6 +69,10 @@ module HighLevelBrowse
|
|
70
69
|
# point of checking, there wasn't added too many, but after left child has
|
71
70
|
# been checked, we might hit the limit and then, "this" will add one as
|
72
71
|
# well.
|
72
|
+
#
|
73
|
+
# (I'm leaving the above paragraph intact as a reminder to myself to
|
74
|
+
# read things over and make sure they're, you know, actual English. At this point
|
75
|
+
# in 2022 I have no idea what the heck I was saying.)
|
73
76
|
|
74
77
|
# Add root?
|
75
78
|
result << root.range if RangeTree.ranges_intersect?(q, root.range)
|
data/test/minitest_helper.rb
CHANGED
@@ -13,15 +13,3 @@ describe "loads" do
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
describe "Works the same as before" do
|
17
|
-
it "gets the same output for 30k randomly chosen call numbers" do
|
18
|
-
h = HighLevelBrowse.fetch_and_save(dir: TESTDIR)
|
19
|
-
JSON.load(File.open(File.join(TESTDIR, '30k_random_old_mappings.json'))).each do |rec|
|
20
|
-
cn = rec['cn'].strip
|
21
|
-
newcats = h[cn]
|
22
|
-
next if rec['jar'].empty?
|
23
|
-
assert_equal [cn, rec['jar'].sort], [rec['cn'], newcats.sort]
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|
metadata
CHANGED
@@ -1,57 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: high_level_browse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bill Dueber
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '1.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: lcsort
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
26
|
+
version: '1.0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: bundler
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
31
|
- - "~>"
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
33
|
+
version: '2.0'
|
48
34
|
type: :development
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
38
|
- - "~>"
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
40
|
+
version: '2.0'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: rake
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,7 +66,7 @@ dependencies:
|
|
80
66
|
- - ">="
|
81
67
|
- !ruby/object:Gem::Version
|
82
68
|
version: '0'
|
83
|
-
description:
|
69
|
+
description:
|
84
70
|
email:
|
85
71
|
- bill@dueber.com
|
86
72
|
executables:
|
@@ -92,12 +78,11 @@ extra_rdoc_files: []
|
|
92
78
|
files:
|
93
79
|
- ".gitignore"
|
94
80
|
- ".travis.yml"
|
81
|
+
- CHANGELOG.md
|
95
82
|
- Gemfile
|
96
83
|
- LICENSE.txt
|
97
84
|
- README.md
|
98
85
|
- Rakefile
|
99
|
-
- bench/bench.rb
|
100
|
-
- bench/hlb.json.gz
|
101
86
|
- bin/fetch_new_hlb
|
102
87
|
- bin/hlb
|
103
88
|
- bin/test_marc_file_for_hlb
|
@@ -113,7 +98,7 @@ homepage: ''
|
|
113
98
|
licenses:
|
114
99
|
- MIT
|
115
100
|
metadata: {}
|
116
|
-
post_install_message:
|
101
|
+
post_install_message:
|
117
102
|
rdoc_options: []
|
118
103
|
require_paths:
|
119
104
|
- lib
|
@@ -128,9 +113,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
113
|
- !ruby/object:Gem::Version
|
129
114
|
version: '0'
|
130
115
|
requirements: []
|
131
|
-
|
132
|
-
|
133
|
-
signing_key:
|
116
|
+
rubygems_version: 3.3.3
|
117
|
+
signing_key:
|
134
118
|
specification_version: 4
|
135
119
|
summary: Map LC call numbers to academic categories.
|
136
120
|
test_files:
|
data/bench/bench.rb
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
require 'benchmark/ips'
|
2
|
-
$:.unshift '../lib'
|
3
|
-
$:.unshift '.'
|
4
|
-
|
5
|
-
|
6
|
-
# On my laptop under normal load (e.g., not very scientific at all)
|
7
|
-
# I get the following running in a single thread
|
8
|
-
# ruby 2.3 ~8500 lookups/second
|
9
|
-
# ruby 2.4 ~9100 lookups/second
|
10
|
-
# jruby 9 ~20k lookups/second
|
11
|
-
# jruby 9, old HLB.jar ~6500 lookups/second
|
12
|
-
# jruby 1.7 error, can't do named arguments
|
13
|
-
# jruby 1.7, old HLB.jar ~6700 lookups/second
|
14
|
-
#
|
15
|
-
# The old HLB.jar has a different (worse) algorithm, but is of
|
16
|
-
# interest because it's what I'm writing this to replace.
|
17
|
-
|
18
|
-
# umich_traject holds .jar files with the old java implementation; see
|
19
|
-
# https://github.com/hathitrust/ht_traject/tree/9e8d414fd9bb2c79e243d289c4d39c05d2de27e5/lib/umich_traject
|
20
|
-
#
|
21
|
-
|
22
|
-
TEST_OLD_STUFF = defined? JRUBY_VERSION and Dir.exist?('./umich_traject')
|
23
|
-
if TEST_OLD_STUFF
|
24
|
-
puts "Loading old HLB3.jar stuff"
|
25
|
-
require 'umich_traject/jackson-core-asl-1.4.3.jar'
|
26
|
-
require 'umich_traject/jackson-mapper-asl-1.4.3.jar'
|
27
|
-
require 'umich_traject/apache-solr-umichnormalizers.jar'
|
28
|
-
require 'umich_traject/HLB3.jar'
|
29
|
-
java_import Java::edu.umich.lib.hlb::HLB
|
30
|
-
puts "Initializing HLB"
|
31
|
-
HLB.initialize()
|
32
|
-
end
|
33
|
-
|
34
|
-
require 'high_level_browse'
|
35
|
-
|
36
|
-
h = HighLevelBrowse.load(dir: '.')
|
37
|
-
|
38
|
-
cns = File.read('call_numbers.txt').split(/\n/).cycle
|
39
|
-
|
40
|
-
puts RUBY_DESCRIPTION
|
41
|
-
|
42
|
-
total = 0
|
43
|
-
Benchmark.ips do |x|
|
44
|
-
x.config(:time => 25, :warmup => 25)
|
45
|
-
|
46
|
-
x.report("HLB lookups") do
|
47
|
-
total += h[cns.next].count
|
48
|
-
end
|
49
|
-
|
50
|
-
if TEST_OLD_STUFF
|
51
|
-
total = 0
|
52
|
-
x.report("Old java lookups") do
|
53
|
-
total += HLB.categories(cns.next).to_a.count
|
54
|
-
end
|
55
|
-
x.compare!
|
56
|
-
end
|
57
|
-
end
|
data/bench/hlb.json.gz
DELETED
Binary file
|