mort666_simhash 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3b3b62e04b6e50547de7e7edf3441da79ce41869
4
- data.tar.gz: dde5d235ee1d2bbeaa9304fe235768883c6525cf
3
+ metadata.gz: e7b5060a56cd39db1a85ff2bca91022d8279742f
4
+ data.tar.gz: 1501e136b7d5c32e11580f42d13ce5cae8a38682
5
5
  SHA512:
6
- metadata.gz: c333facb22f608bf1caec2de3023f30ccc552f116a323063819dfcfe2bc49d1bc397a3f930d3ac8caa4e19d1f6308542096b4732e5ae308a9c1dc241083674ee
7
- data.tar.gz: 5c1ef0e033e82009858a7136e23c01936ce7083c551783c8987f36a31a294e0a1babdc82e2bd3a943c60a444bb724fe09affc2b1998495b2172dd8d58b1e06cf
6
+ metadata.gz: 9f851149712bb5384d1657b3c87cfa90fc81ae0775118a2651ccd710016f980d1377685f55d8a9bc5210204bbbf860e206c80896996376da8ff669833e924d39
7
+ data.tar.gz: 8c3eee0868acdbf881cf61727a82388f17509bb91c57de6926d0fe409ca4734e3f51aa309a976438a15c9bd095e4205b73e4a5dd2d7863ac6587679f1d7e1875
data/Rakefile CHANGED
@@ -23,7 +23,7 @@ end
23
23
  desc 'Start an IRB session with all necessary files required.'
24
24
  task :shell do |t|
25
25
  chdir File.dirname(__FILE__)
26
- exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
26
+ exec 'irb -I lib/ -I lib/simhash -I lib/simhash/string -I lib/simhash/integer -r rubygems'
27
27
  end
28
28
 
29
29
  desc 'Build the gemspec.'
File without changes
@@ -2,7 +2,7 @@ class String
2
2
  def simhash(options={})
3
3
  split_by = options.delete(:split_by) || " "
4
4
 
5
- # Do the punctuation clean before the split..
5
+ # Do the punctuation clean before the split..
6
6
  # You could argue this is not preserving the meaning doing so actually preserves the edge case where a hyphen is removed
7
7
  # resulting hash does not match the same string with a space in there before the split
8
8
  if options[:preserve_punctuation]
@@ -11,7 +11,7 @@ class String
11
11
  Simhash.hash(self.gsub(Simhash::PUNCTUATION_REGEXP, ' ') .split(split_by), options)
12
12
  end
13
13
  end
14
-
14
+
15
15
  def hash_vl_rb(length)
16
16
  return 0 if self == ""
17
17
 
@@ -25,4 +25,4 @@ class String
25
25
  x
26
26
  end
27
27
 
28
- end
28
+ end
data/lib/simhash.rb CHANGED
@@ -3,23 +3,23 @@
3
3
  require 'active_support/core_ext/string/multibyte'
4
4
  require 'unicode'
5
5
 
6
- require 'string'
7
- require 'integer'
6
+ require 'simhash/string'
7
+ require 'simhash/integer'
8
8
  require 'simhash/stopwords'
9
9
 
10
- begin
10
+ #begin
11
11
  require 'string_hashing'
12
- rescue LoadError
13
- end
12
+ #rescue LoadError
13
+ #end
14
14
 
15
- module Simhash
15
+ module Simhash
16
16
  DEFAULT_STRING_HASH_METHOD = String.public_instance_methods.include?("hash_vl") ? :hash_vl : :hash_vl_rb
17
17
  PUNCTUATION_REGEXP = if RUBY_VERSION >= "1.9"
18
18
  /(\s|\d|[^\p{L}]|\302\240| *— *|[«»…\-–—]| )+/u
19
19
  else
20
20
  /(\s|\d|\W|\302\240| *— *|[«»…\-–—]| )+/u
21
21
  end
22
-
22
+
23
23
  # Compare calculates the Hamming distance between two 64-bit integers
24
24
  #
25
25
  # Currently, this is calculated using the Kernighan method [1]. Other methods
@@ -40,25 +40,25 @@ module Simhash
40
40
  def self.hash(tokens, options={})
41
41
  hashbits = options[:hashbits] || 64
42
42
  hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
43
-
43
+
44
44
  v = [0] * hashbits
45
45
  masks = v.dup
46
46
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
47
-
47
+
48
48
  self.each_filtered_token(tokens, options) do |token|
49
49
  hashed_token = token.send(hashing_method, hashbits).to_i
50
50
  hashbits.times do |i|
51
51
  v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
52
52
  end
53
53
  end
54
-
54
+
55
55
  fingerprint = 0
56
56
 
57
- hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
58
-
59
- fingerprint
57
+ hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
58
+
59
+ fingerprint
60
60
  end
61
-
61
+
62
62
  def self.each_filtered_token(tokens, options={})
63
63
  token_min_size = options[:token_min_size].to_i
64
64
  stop_sentenses = options[:stop_sentenses]
@@ -66,28 +66,28 @@ module Simhash
66
66
  # cutting punctuation (\302\240 is unbreakable space)
67
67
  # Moved up to string class
68
68
  # token = token.gsub(PUNCTUATION_REGEXP, ' ') if !options[:preserve_punctuation]
69
-
69
+
70
70
  token = Unicode::downcase(token.strip)
71
-
71
+
72
72
  # cutting stop-words
73
73
  token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
74
-
74
+
75
75
  # cutting stop-sentenses
76
76
  next if stop_sentenses && stop_sentenses.include?(" #{token} ")
77
-
77
+
78
78
  next if token.size.zero? || token.mb_chars.size < token_min_size
79
-
80
- yield token
79
+
80
+ yield token
81
81
  end
82
82
  end
83
-
83
+
84
84
  def self.filtered_tokens(tokens, options={})
85
85
  filtered_tokens = []
86
86
  self.each_filtered_token(tokens, options) { |token| filtered_tokens << token }
87
- filtered_tokens
87
+ filtered_tokens
88
88
  end
89
-
89
+
90
90
  def self.hm
91
91
  @@string_hash_method
92
92
  end
93
- end
93
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mort666_simhash
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Gusev
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-09-26 00:00:00.000000000 Z
12
+ date: 2017-08-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -109,12 +109,12 @@ files:
109
109
  - Rakefile
110
110
  - ext/string_hashing/extconf.rb
111
111
  - ext/string_hashing/string_hashing.c
112
- - lib/integer.rb
113
112
  - lib/simhash.rb
113
+ - lib/simhash/integer.rb
114
114
  - lib/simhash/stopwords.rb
115
115
  - lib/simhash/stopwords/en.rb
116
116
  - lib/simhash/stopwords/ru.rb
117
- - lib/string.rb
117
+ - lib/simhash/string.rb
118
118
  homepage: http://github.com/mort666/simhash
119
119
  licenses: []
120
120
  metadata: {}
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
134
  version: '0'
135
135
  requirements: []
136
136
  rubyforge_project:
137
- rubygems_version: 2.4.3
137
+ rubygems_version: 2.6.12
138
138
  signing_key:
139
139
  specification_version: 4
140
140
  summary: 'Gives you possbility to convert string into simhashes to futher use: finding