simhash2 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 8e8190a872a7ec498e0a500482e8814758de6cb4
4
- data.tar.gz: b0ea69a076ab1a99584010b3c85b6c3d17d37eea
2
+ SHA256:
3
+ metadata.gz: ef3ff21591cb53fba63b03c7623c6cadb3874c6d2f67ac9e817c3533b9f9c3d8
4
+ data.tar.gz: aca99aa445defc207028a58948df20a293abdae66a2a3bf201a96bf9ccf19d49
5
5
  SHA512:
6
- metadata.gz: 290b5f9daf27c8d4a138e26aabc062b4c8896fb6506bb41da13ea5c1e44095286e82b9b34780b189d22fdae1e4f4d34d7dc0ce6438b9c47539584b82fb23a541
7
- data.tar.gz: 5b961fbf6271a50571d557f628ef111ca7f9683378869bc9443635cb678b0a06942424287e5e6103702074f604a8ac39a06d2a2120c15b3bbfc26ee77d1cc5ab
6
+ metadata.gz: f9d2b12eaad8fd994a3e2f68aef4f75aa0b34358f2ca685e33371374dadbc3041046acb4999073a90d1a7d8d4346dde3e9ed79d6f6448aead28420227b0057cc
7
+ data.tar.gz: 8f978c0061d29706363b8c8dc63efee5d7cc9403ca2482173b09ba11bed256e1c2e7255d5adbeb40407648ff5efd19f6095131582c6944d5a625ecbbeabf3afe
data/README.md CHANGED
@@ -35,3 +35,32 @@ simhash2 = Simhash.generate(str2) # => 13921220612431195624
35
35
  Simhash.hamming_distance(simhash1, simhash2) # => 8
36
36
  ```
37
37
 
38
+ ## Performance
39
+
40
+ Thanks to performance optimizations contributed by [JayTeeSF](https://github.com/JayTeeSF), this gem generally outperforms `bookmate/simhash`, especially on long strings that contain many tokens.
41
+
42
+ ```ruby
43
+ test_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
44
+
45
+ def test_simhash (x)
46
+ x.simhash # bookmate/simhash
47
+ end
48
+
49
+ def test_simhash2 (x)
50
+ Simhash.generate(x) # this gem
51
+ end
52
+
53
+ n = 5000
54
+ Benchmark.bm do |x|
55
+ x.report("simhash") { for i in 1..n; test_simhash(test_str); end }
56
+ x.report("simhash2") { for i in 1..n; test_simhash2(test_str); end }
57
+ end
58
+ ```
59
+
60
+ Results:
61
+
62
+ ```
63
+ user system total real
64
+ simhash 5.109375 0.093750 5.203125 ( 5.199069)
65
+ simhash2 4.109375 0.000000 4.109375 ( 4.108586)
66
+ ```
data/lib/simhash2.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'simhash2/version'
1
+ require_relative 'simhash2/version'
2
2
 
3
3
  module Simhash
4
4
  extend self
@@ -17,19 +17,20 @@ module Simhash
17
17
  end
18
18
 
19
19
  def generate(str, options = {})
20
+ # Splitting on whitespace is how the tokens are produced;
21
+ # change the split below if shingles should be used instead of single-word tokens.
20
22
  generate_from_tokens(str.split(/\s+/), options)
21
23
  end
22
24
 
23
25
  def generate_from_tokens(tokens, options = {})
24
- filter_tokens(tokens, OPTIONS.merge(options))
25
-
26
26
  v = [0] * HASHBITS
27
-
28
27
  masks = v.dup
29
28
  masks.each_with_index { |_e, i| masks[i] = (1 << i) }
30
29
 
31
- hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
32
- hashes.each do |h|
30
+ filter_tokens(tokens, OPTIONS.merge(options)) do |token|
31
+ h = simple_string_hash(token, HASHBITS)
32
+ #warn "simple_string_hash (for: #{token.inspect}): #{h.inspect}"
33
+
33
34
  HASHBITS.times do |i|
34
35
  v[i] += (h & masks[i]).zero? ? -1 : +1
35
36
  end
@@ -65,12 +66,28 @@ module Simhash
65
66
  x.to_i
66
67
  end
67
68
 
68
- def filter_tokens(tokens, options)
69
- tokens.map! { |e| e.downcase.gsub(/\W+/, '') }
70
- tokens.reject! { |e| e.nil? || e.length < options[:min_token_length] }
71
- tokens.reject! { |e| options[:stop_words].include?(e) } unless options[:stop_words].nil? || options[:stop_words].empty?
72
- tokens.map!(&:stem) if options[:stemming]
73
- tokens.uniq! if options[:unique]
69
+ def filter_tokens(tokens, options, &block)
70
+ altered_tokens = []
71
+ tokens.each do |e|
72
+ new_e = e.downcase.gsub(/\W+/, '')
73
+ next if new_e.nil? || new_e.length < options[:min_token_length]
74
+ if options[:stop_words] && !options[:stop_words].empty?
75
+ next if options[:stop_words].include?(new_e)
76
+ end
77
+ if options[:stemming]
78
+ altered_tokens << new_e.stem
79
+ else
80
+ altered_tokens << new_e
81
+ end
82
+ end
83
+ altered_tokens.uniq! if options[:unique]
84
+
85
+ if block_given?
86
+ altered_tokens.each {|e| block[e] }
87
+ else
88
+ tokens.clear
89
+ altered_tokens.each {|e| tokens << e }
90
+ tokens
91
+ end
74
92
  end
75
-
76
93
  end
@@ -1,3 +1,3 @@
1
1
  module Simhash
2
- VERSION = '0.0.3'.freeze
2
+ VERSION = '0.0.4'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Wong
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-01 00:00:00.000000000 Z
11
+ date: 2018-10-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
79
  version: '0'
80
80
  requirements: []
81
81
  rubyforge_project:
82
- rubygems_version: 2.6.11
82
+ rubygems_version: 2.7.6
83
83
  signing_key:
84
84
  specification_version: 4
85
85
  summary: A rewrite of the 'simhash' gem, which is an implementation of Moses Charikar's