high_level_browse 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 286d0ce64d0d9e8dffa58b716f111d086310654d
4
- data.tar.gz: 2a13aad07ee29e47b0bcc00f4ba16740491e9bfd
2
+ SHA256:
3
+ metadata.gz: 5eae24ef8906dfb25f1f949b4b29b69a33834dc63a7b0d96cd356324b14a79df
4
+ data.tar.gz: 919434516a5098b1c4f9766434c68b7b7e6960128a0eb59abc7e442da520de5e
5
5
  SHA512:
6
- metadata.gz: 9960852abc0686da303da11c8ead326df0ea7e7df89432962f7d1353e62350afbc7a3ad556d1beecfe6cce816c1bf654ce4bdee78bb195caefdb08caeb67b7cf
7
- data.tar.gz: 3d29b51feb0bd70c37eea28248eff4f3dccd8a38cdb23617be8998fdfa821e392743d0f87c25e9f233ee326b4097072ca63d3d04bcdf5c21216ec43a96ecae04
6
+ metadata.gz: 5aa8cb1cb8472c788c1def8efff185a8fc16957ef97450178687941f6dc5a03444f6d85b1c25f601aa34eef02ed51e8ab7475c9d067e2db35aa94118d319a7dc
7
+ data.tar.gz: 390752555d91aec5060be2a34097b9cc6ca664b22c44ae6b29a989e9cb766bc45b83599310d2fec2ee79fe3155678c41fc52c24f6a3658692ab7b2ac32bb513d
data/CHANGELOG.md ADDED
@@ -0,0 +1,13 @@
1
+ # High Level Browse
2
+
3
+ ## 1.0.0
4
+
5
+ * New normalization algorithm.
6
+ * Because the normalization algorithm has changed, the on-disk file
7
+ (`hlb.json.gz`) needs to be re-generated; the old normalization
8
+ form and the new one won't match each other.
9
+
10
+ ## 0.2.0
11
+
12
+ * First real release
13
+
data/bin/hlb CHANGED
@@ -28,7 +28,7 @@ require 'tmpdir'
28
28
  filename = HighLevelBrowse::DB::FILENAME
29
29
  dir = Dir.tmpdir()
30
30
  fullpath = File.join(dir, filename)
31
-
31
+ puts fullpath
32
32
  hlb = if File.exist?(fullpath)
33
33
  HighLevelBrowse.load(dir: dir)
34
34
  else
@@ -7,7 +7,6 @@ end
7
7
 
8
8
  require 'marc'
9
9
  require 'high_level_browse'
10
- require 'lcsort'
11
10
  require 'tmpdir'
12
11
 
13
12
 
@@ -38,7 +37,7 @@ Counter = Struct.new(:count, :invalid, :found, :notfound, :hlb) do
38
37
 
39
38
 
40
39
  def check_cn(cn)
41
- normalized = Lcsort.normalize(cn)
40
+ normalized = HighLevelBrowse::CallNumberRange.callnumber_normalize(cn)
42
41
  return :invalid if normalized.nil?
43
42
  cats = hlb[cn]
44
43
  if cats.empty?
@@ -17,10 +17,9 @@ Gem::Specification.new do |spec|
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ["lib"]
19
19
 
20
- spec.add_dependency 'oga', '~> 2.1'
21
- spec.add_dependency 'lcsort'
20
+ spec.add_dependency 'nokogiri', '~>1.0'
22
21
 
23
- spec.add_development_dependency "bundler", "~> 1.6"
22
+ spec.add_development_dependency "bundler", '~>2.0'
24
23
  spec.add_development_dependency "rake"
25
24
  spec.add_development_dependency "minitest"
26
25
  end
@@ -1,21 +1,25 @@
1
- require 'lcsort'
2
1
  require 'high_level_browse/range_tree'
3
2
 
4
-
5
3
  # An efficient set of CallNumberRanges from which to get topics
6
4
  class HighLevelBrowse::CallNumberRangeSet < HighLevelBrowse::RangeTree
7
5
 
6
+ ANY_DIGIT = /\d/.freeze
7
+
8
+ def has_digits(str)
9
+ ANY_DIGIT.match?(str)
10
+ end
8
11
 
9
12
  # Returns the array of topic arrays for the given LC string
10
13
  # @param [String] raw_lc A raw LC string (e.g., 'qa 112.3 .A4 1990')
11
14
  # @return [Array<Array<String>>] Arrays of topic labels
12
15
  def topics_for(raw_lc)
13
- normalized = Lcsort.normalize(HighLevelBrowse::CallNumberRange.preprocess(raw_lc))
16
+ normalized = ::HighLevelBrowse::CallNumberRange.callnumber_normalize(raw_lc)
14
17
  self.search(normalized).map(&:topic_array).uniq
18
+ rescue => e
19
+ require 'pry'; binding.pry
15
20
  end
16
21
  end
17
22
 
18
-
19
23
  # A callnumber-range keeps track of the original begin/end
20
24
  # strings as well as the normalized versions, and can be
21
25
  # serialized to JSON
@@ -25,7 +29,6 @@ class HighLevelBrowse::CallNumberRange
25
29
 
26
30
  attr_reader :min, :max, :min_raw, :max_raw, :firstletter
27
31
 
28
-
29
32
  attr_accessor :topic_array, :redundant
30
33
 
31
34
  SPACE_OR_PUNCT = /\A[\s\p{Punct}]*(.*?)[\s\p{Punct}]*\Z/
@@ -42,27 +45,71 @@ class HighLevelBrowse::CallNumberRange
42
45
  def self.force_break_between_digit_and_letter(str)
43
46
  str.gsub(DIGIT_TO_LETTER, '\1 \2')
44
47
  end
48
+
45
49
  # @nodoc
46
50
  # Preprocess the string, removing spaces/punctuation off the end
47
51
  # and forcing a space where there's a digit->letter transition
48
- def self.preprocess(str)
49
- str ||= ''
50
- force_break_between_digit_and_letter(
51
- strip_spaces_and_punct(str)
52
- )
52
+ # def self.preprocess(str)
53
+ # str ||= ''
54
+ # force_break_between_digit_and_letter(strip_spaces_and_punct(str)
55
+ # )
56
+ # end
57
+
58
+ # Normalize the callnumber in a slightly more sane way
59
+ # @param [String] cs_str The raw callnumber to normalize
60
+ CN = /\A\s*(?<letters>\p{L}{1,3})\s*(?<digits>\d{1,5}(?!\d))(?:\.(?<decimals>\d+))?(?<rest>.*)\Z/.freeze
61
+
62
+ def self.callnumber_normalize(cs_str)
63
+ return nil if cs_str.nil?
64
+
65
+ cs_str = cs_str.downcase
66
+ return cs_str if /\A\s*\p{L}{1,3}+\s*\Z/.match? cs_str # just letters
67
+
68
+ m = CN.match(cs_str)
69
+ return nil unless m
70
+
71
+ digits = m[:digits].size.to_s + m[:digits]
72
+ decimals = m[:decimals] ? "." + m[:decimals] : ""
73
+ rest = cleanup_freetext(m[:rest])
74
+ clean = m[:letters] + digits + decimals + " " + rest
75
+ clean.strip.gsub(/\s+/, ' ')
53
76
  end
54
77
 
78
+ # @param [String] str String to clean up
79
+ def self.cleanup_freetext(str)
80
+ return "" if str.nil?
81
+
82
+ s = str.strip
83
+ return s if s == ""
84
+
85
+ s = replace_dot_before_letter_with_space(s)
86
+ s = remove_dots_between_letters(s)
87
+ s = force_space_between_digit_and_letter(s)
88
+ s.strip.gsub(/\s+/, ' ')
89
+ end
90
+
91
+ def self.replace_dot_before_letter_with_space(s)
92
+ s.gsub /\.(\p{L})/, '\\1'
93
+ end
94
+
95
+ # @param [String] str
96
+ def self.remove_dots_between_letters(str)
97
+ str.gsub(/(\p{L})\.(\p{L})/, '\\1\\2')
98
+ end
99
+
100
+ def self.force_space_between_digit_and_letter(s)
101
+ s.gsub(/(\d)(\p{L})/, '\\1 \\2')
102
+ end
55
103
 
56
104
  def initialize(min:, max:, topic_array:)
57
- @illegal = false
58
- @redundant = false
59
- self.min = self.class.preprocess(min)
60
- self.max = self.class.preprocess(max)
105
+ @illegal = false
106
+ @redundant = false
107
+ self.min = min
108
+ self.max = max
61
109
  @topic_array = topic_array
62
110
  @firstletter = self.min[0] unless @illegal
63
111
  end
64
112
 
65
-
66
113
  # Compare based on @min, then end
67
114
  # @param [CallNumberRange] o the range to compare to
68
115
  def <=>(o)
@@ -74,31 +121,25 @@ class HighLevelBrowse::CallNumberRange
74
121
  end
75
122
 
76
123
  def reconstitute(min, max, min_raw, max_raw, firstletter, topic_array)
77
- @min = min
78
- @max = max
79
- @min_raw = min_raw
80
- @max_raw = max_raw
124
+ @min = min
125
+ @max = max
126
+ @min_raw = min_raw
127
+ @max_raw = max_raw
81
128
  @firstletter = firstletter
82
129
  @topic_array = topic_array
83
130
  end
84
131
 
85
-
86
132
  # Two ranges are equal if their @min, @max, and topic array
87
133
  # are all the same
88
134
  # @param [CallNumberRange] o the range to compare to
89
135
  def ==(other)
90
- @min == other.min and
91
- @max == other.max and
92
- @topic_array == other.topic_array
136
+ @min == other.min and @max == other.max and @topic_array == other.topic_array
93
137
  end
94
138
 
95
-
96
139
  # @nodoc
97
140
  # JSON roundtrip
98
141
  def to_json(*a)
99
- {
100
- 'json_class' => self.class.name,
101
- 'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
142
+ {'json_class' => self.class.name, 'data' => [@min, @max, @min_raw, @max_raw, @firstletter, @topic_array]
102
143
  }.to_json(*a)
103
144
  end
104
145
 
@@ -109,29 +150,26 @@ class HighLevelBrowse::CallNumberRange
109
150
  cnr
110
151
  end
111
152
 
112
-
113
153
  # In both min= and max=, when the value fails to normalize we
114
154
  # simply set the @illegal flag so we can use it later on.
115
155
  def min=(x)
116
- @min_raw = x
117
- possible_min = Lcsort.normalize(x)
156
+ @min_raw = x
157
+ possible_min = self.class.callnumber_normalize(x)
118
158
  if possible_min.nil? # didn't normalize
119
159
  @illegal = true
120
160
  nil
121
- else
122
- @min = possible_min
161
+ else @min = possible_min
123
162
  end
124
163
  end
125
164
 
126
165
  # Same as min=. Set the illegal flag if the value fails to normalize
127
166
  def max=(x)
128
- @max_raw = x
129
- possible_max = Lcsort.normalize(x)
167
+ @max_raw = x
168
+ possible_max = self.class.callnumber_normalize(x)
130
169
  if possible_max.nil? # didn't normalize
131
170
  @illegal = true
132
171
  nil
133
- else
134
- @max = possible_max + '~' # add a tilde to make it a true endpoint
172
+ else @max = possible_max + '~' # add a tilde to make it a true endpoint
135
173
  end
136
174
  end
137
175
 
@@ -139,7 +177,6 @@ class HighLevelBrowse::CallNumberRange
139
177
  @illegal
140
178
  end
141
179
 
142
-
143
180
  def surrounds(other)
144
181
  @min <= other.min and @max >= other.max
145
182
  end
@@ -1,4 +1,4 @@
1
- require 'oga'
1
+ require 'nokogiri'
2
2
  require 'high_level_browse/call_number_range'
3
3
  require 'zlib'
4
4
  require 'json'
@@ -13,7 +13,7 @@ class HighLevelBrowse::DB
13
13
  # database with an efficient structure for querying
14
14
  # @param [Array<HighLevelBrowse::CallNumberRange>] array_of_ranges
15
15
  def initialize(array_of_ranges)
16
- @all = array_of_ranges
16
+ @all = array_of_ranges
17
17
  @ranges = self.create_letter_indexed_ranges(@all)
18
18
  end
19
19
 
@@ -22,8 +22,8 @@ class HighLevelBrowse::DB
22
22
  # @private
23
23
  def create_letter_indexed_ranges(all)
24
24
  bins = {}
25
- ('A'..'Z').each do |letter|
26
- cnrs = all.find_all {|x| x.firstletter == letter}
25
+ ('a'..'z').each do |letter|
26
+ cnrs = all.find_all { |x| x.firstletter == letter }
27
27
  bins[letter] = HighLevelBrowse::CallNumberRangeSet.new(cnrs)
28
28
  end
29
29
  bins
@@ -40,16 +40,17 @@ class HighLevelBrowse::DB
40
40
  # @return [Array<Array>] A (possibly empty) array of arrays of topics
41
41
  def topics(*raw_callnumber_strings)
42
42
  raw_callnumber_strings.reduce([]) do |acc, raw_callnumber_string|
43
- firstletter = raw_callnumber_string.strip.upcase[0]
43
+ firstletter = if raw_callnumber_string.nil?
44
+ nil
45
+ else raw_callnumber_string.to_s.strip.downcase[0]
46
+ end
44
47
  if @ranges.has_key? firstletter
45
48
  acc + @ranges[firstletter].topics_for(raw_callnumber_string)
46
- else
47
- acc
49
+ else acc
48
50
  end
49
51
  end.uniq
50
52
  end
51
53
 
52
-
53
54
  alias_method :[], :topics
54
55
 
55
56
  # Create a new object from a string with the XML
@@ -58,12 +59,11 @@ class HighLevelBrowse::DB
58
59
  # (e.g., from 'https://www.lib.umich.edu/browse/categories/xml.php')
59
60
  # @return [DB]
60
61
  def self.new_from_xml(xml)
61
- oga_doc_root = Oga.parse_xml(xml)
62
- simple_array_of_cnrs = cnrs_within_oga_node(node: oga_doc_root)
62
+ noko_doc_root = Nokogiri::XML(xml)
63
+ simple_array_of_cnrs = cnrs_within_noko_node(node: noko_doc_root)
63
64
  self.new(simple_array_of_cnrs).freeze
64
65
  end
65
66
 
66
-
67
67
  # Save to disk
68
68
  # @param [String] dir The directory where the hlb.json.gz file will be saved
69
69
  # @return [DB] The loaded database
@@ -73,7 +73,6 @@ class HighLevelBrowse::DB
73
73
  end
74
74
  end
75
75
 
76
-
77
76
  # Load from disk
78
77
  # @param [String] dir The directory where the hlb.json.gz file is located
79
78
  # @return [DB] The loaded database
@@ -81,12 +80,11 @@ class HighLevelBrowse::DB
81
80
  simple_array_of_cnrs = Zlib::GzipReader.open(File.join(dir, FILENAME)) do |infile|
82
81
  JSON.load(infile.read).to_a
83
82
  end
84
- db = self.new(simple_array_of_cnrs)
83
+ db = self.new(simple_array_of_cnrs)
85
84
  db.freeze
86
85
  db
87
86
  end
88
87
 
89
-
90
88
  # Freeze everything
91
89
  # @return [DB] the frozen db
92
90
  def freeze
@@ -102,49 +100,44 @@ class HighLevelBrowse::DB
102
100
  # * what the current topics are ([level1, level2])
103
101
  # Get all the call numbers associated with the topic represented by the given node,
104
102
  # as well as all the children of the given node, and send it back as a big ol' array
105
- # @param [Oga::Node] node A node of the parsed HLB XML file
103
+ # @param [Nokogiri::XML::Node] node A node of the parsed HLB XML file
106
104
  # @param [Array<String>] decendent_xpaths A list of xpaths to the descendants of this node
107
105
  # @param [Array<String>] topic_array An array with all levels of the topics associated with this node
108
106
  # @return [Array<HighLevelBrowse::CallNumberRange>]
109
- def self.cnrs_within_oga_node(node:, decendent_xpaths: ['/hlb/subject', 'topic', 'sub-topic'], topic_array: [])
107
+ def self.cnrs_within_noko_node(node:, decendent_xpaths: ['/hlb/subject', 'topic', 'sub-topic'], topic_array: [])
110
108
  if decendent_xpaths.empty?
111
109
  [] # base case -- we're as low as we're going to go
112
- else
113
- current_xpath_component = decendent_xpaths[0]
114
- new_xpath = decendent_xpaths[1..-1]
115
- new_topic = topic_array.dup
116
- new_topic.push node.get(:name) unless node == node.root_node # skip the root
117
- cnrs = []
118
- # For each sub-component, get both the call-number-ranges (cnrs) assocaited
119
- # with this level, as well as recusively getting from all the children
120
- node.xpath(current_xpath_component).each do |c|
121
- cnrs += call_numbers_list_from_leaves(node: c, topic_array: new_topic)
122
- cnrs += cnrs_within_oga_node(node: c, decendent_xpaths: new_xpath, topic_array: new_topic)
123
- end
124
- cnrs
110
+ else current_xpath_component = decendent_xpaths[0]
111
+ new_xpath = decendent_xpaths[1..-1]
112
+ new_topic = topic_array.dup
113
+ new_topic.push node[:name] unless node == node.document # skip the root
114
+ cnrs = []
115
+ # For each sub-component, get both the call-number-ranges (cnrs) associated
116
+ # with this level, as well as recursively getting from all the children
117
+ node.xpath(current_xpath_component).each do |c|
118
+ cnrs += call_numbers_list_from_leaves(node: c, topic_array: new_topic)
119
+ cnrs += cnrs_within_noko_node(node: c, decendent_xpaths: new_xpath, topic_array: new_topic)
120
+ end
121
+ cnrs
125
122
  end
126
123
  end
127
124
 
128
-
129
125
  # Given a second-to-lowest-level node, get its topic and
130
126
  # extract call number ranges from its children
131
127
  def self.call_numbers_list_from_leaves(node:, topic_array:)
132
- cnrs = []
133
- new_topic = topic_array.dup.push node.get(:name)
128
+ cnrs = []
129
+ new_topic = topic_array.dup.push node[:name]
134
130
  node.xpath('call-numbers').each do |cn_node|
135
- min = cn_node.get(:start)
136
- max = cn_node.get(:end)
131
+ min = cn_node[:start]
132
+ max = cn_node[:end]
137
133
 
138
134
  new_cnr = HighLevelBrowse::CallNumberRange.new(min: min, max: max, topic_array: new_topic)
139
135
  if new_cnr.illegal?
140
- # do some sort of logging
141
- else
142
- cnrs.push new_cnr
136
+ # do some sort of logging else cnrs.push new_cnr
143
137
  end
144
138
  end
145
139
  cnrs
146
140
 
147
141
  end
148
142
 
149
-
150
143
  end
@@ -48,8 +48,8 @@ module HighLevelBrowse
48
48
  end
49
49
 
50
50
  def search(range, limit: Float::INFINITY)
51
+ return [] if range.nil?
51
52
  range = range.is_a?(Range) ? range : (range..range)
52
-
53
53
  result = []
54
54
  RangeTree.search_helper(range, @root, result, limit)
55
55
 
@@ -58,7 +58,6 @@ module HighLevelBrowse
58
58
 
59
59
  def self.search_helper(q, root, result, limit)
60
60
  return if root.nil?
61
-
62
61
  # Visit left child?
63
62
  if (l = root.left) and l.max and q.min and \
64
63
  not l.max < q.min # The interesting part.
@@ -70,6 +69,10 @@ module HighLevelBrowse
70
69
  # point of checking, there wasn't added too many, but after left child has
71
70
  # been checked, we might hit the limit and then, "this" will add one as
72
71
  # well.
72
+ #
73
+ # (I'm leaving the above paragraph intact as a reminder to myself to
74
+ # read things over and make sure they're, you know, actual English. At this point
75
+ # in 2022 I have no idea what the heck I was saying.)
73
76
 
74
77
  # Add root?
75
78
  result << root.range if RangeTree.ranges_intersect?(q, root.range)
@@ -1,3 +1,3 @@
1
1
  module HighLevelBrowse
2
- VERSION = "0.1.0"
2
+ VERSION = "1.0.0"
3
3
  end
@@ -5,7 +5,6 @@ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
5
5
 
6
6
  verbose = $VERBOSE
7
7
  $VERBOSE = nil
8
- require 'oga'
9
8
  require 'minitest'
10
9
  require 'minitest/spec'
11
10
  require 'minitest/autorun'
@@ -13,15 +13,3 @@ describe "loads" do
13
13
  end
14
14
  end
15
15
 
16
- describe "Works the same as before" do
17
- it "gets the same output for 30k randomly chosen call numbers" do
18
- h = HighLevelBrowse.fetch_and_save(dir: TESTDIR)
19
- JSON.load(File.open(File.join(TESTDIR, '30k_random_old_mappings.json'))).each do |rec|
20
- cn = rec['cn'].strip
21
- newcats = h[cn]
22
- next if rec['jar'].empty?
23
- assert_equal [cn, rec['jar'].sort], [rec['cn'], newcats.sort]
24
- end
25
-
26
- end
27
- end
metadata CHANGED
@@ -1,57 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: high_level_browse
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bill Dueber
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-02 00:00:00.000000000 Z
11
+ date: 2022-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: oga
14
+ name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: '1.0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
27
- - !ruby/object:Gem::Dependency
28
- name: lcsort
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
26
+ version: '1.0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: bundler
43
29
  requirement: !ruby/object:Gem::Requirement
44
30
  requirements:
45
31
  - - "~>"
46
32
  - !ruby/object:Gem::Version
47
- version: '1.6'
33
+ version: '2.0'
48
34
  type: :development
49
35
  prerelease: false
50
36
  version_requirements: !ruby/object:Gem::Requirement
51
37
  requirements:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
- version: '1.6'
40
+ version: '2.0'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: rake
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -80,7 +66,7 @@ dependencies:
80
66
  - - ">="
81
67
  - !ruby/object:Gem::Version
82
68
  version: '0'
83
- description:
69
+ description:
84
70
  email:
85
71
  - bill@dueber.com
86
72
  executables:
@@ -92,12 +78,11 @@ extra_rdoc_files: []
92
78
  files:
93
79
  - ".gitignore"
94
80
  - ".travis.yml"
81
+ - CHANGELOG.md
95
82
  - Gemfile
96
83
  - LICENSE.txt
97
84
  - README.md
98
85
  - Rakefile
99
- - bench/bench.rb
100
- - bench/hlb.json.gz
101
86
  - bin/fetch_new_hlb
102
87
  - bin/hlb
103
88
  - bin/test_marc_file_for_hlb
@@ -113,7 +98,7 @@ homepage: ''
113
98
  licenses:
114
99
  - MIT
115
100
  metadata: {}
116
- post_install_message:
101
+ post_install_message:
117
102
  rdoc_options: []
118
103
  require_paths:
119
104
  - lib
@@ -128,9 +113,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
128
113
  - !ruby/object:Gem::Version
129
114
  version: '0'
130
115
  requirements: []
131
- rubyforge_project:
132
- rubygems_version: 2.6.8
133
- signing_key:
116
+ rubygems_version: 3.3.3
117
+ signing_key:
134
118
  specification_version: 4
135
119
  summary: Map LC call numbers to academic categories.
136
120
  test_files:
data/bench/bench.rb DELETED
@@ -1,57 +0,0 @@
1
- require 'benchmark/ips'
2
- $:.unshift '../lib'
3
- $:.unshift '.'
4
-
5
-
6
- # On my laptop under normal load (e.g., not very scientific at all)
7
- # I get the following running in a single thread
8
- # ruby 2.3 ~8500 lookups/second
9
- # ruby 2.4 ~9100 lookups/second
10
- # jruby 9 ~20k lookups/second
11
- # jruby 9, old HLB.jar ~6500 lookups/second
12
- # jruby 1.7 error, can't do named arguments
13
- # jruby 1.7, old HLB.jar ~6700 lookups/second
14
- #
15
- # The old HLB.jar has a different (worse) algorithm, but is of
16
- # interest because it's what I'm writing this to replace.
17
-
18
- # umich_traject holds .jar files with the old java implementation; see
19
- # https://github.com/hathitrust/ht_traject/tree/9e8d414fd9bb2c79e243d289c4d39c05d2de27e5/lib/umich_traject
20
- #
21
-
22
- TEST_OLD_STUFF = defined? JRUBY_VERSION and Dir.exist?('./umich_traject')
23
- if TEST_OLD_STUFF
24
- puts "Loading old HLB3.jar stuff"
25
- require 'umich_traject/jackson-core-asl-1.4.3.jar'
26
- require 'umich_traject/jackson-mapper-asl-1.4.3.jar'
27
- require 'umich_traject/apache-solr-umichnormalizers.jar'
28
- require 'umich_traject/HLB3.jar'
29
- java_import Java::edu.umich.lib.hlb::HLB
30
- puts "Initializing HLB"
31
- HLB.initialize()
32
- end
33
-
34
- require 'high_level_browse'
35
-
36
- h = HighLevelBrowse.load(dir: '.')
37
-
38
- cns = File.read('call_numbers.txt').split(/\n/).cycle
39
-
40
- puts RUBY_DESCRIPTION
41
-
42
- total = 0
43
- Benchmark.ips do |x|
44
- x.config(:time => 25, :warmup => 25)
45
-
46
- x.report("HLB lookups") do
47
- total += h[cns.next].count
48
- end
49
-
50
- if TEST_OLD_STUFF
51
- total = 0
52
- x.report("Old java lookups") do
53
- total += HLB.categories(cns.next).to_a.count
54
- end
55
- x.compare!
56
- end
57
- end
data/bench/hlb.json.gz DELETED
Binary file