mort666_simhash 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/lib/{integer.rb → simhash/integer.rb} +0 -0
- data/lib/{string.rb → simhash/string.rb} +3 -3
- data/lib/simhash.rb +24 -24
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e7b5060a56cd39db1a85ff2bca91022d8279742f
|
4
|
+
data.tar.gz: 1501e136b7d5c32e11580f42d13ce5cae8a38682
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9f851149712bb5384d1657b3c87cfa90fc81ae0775118a2651ccd710016f980d1377685f55d8a9bc5210204bbbf860e206c80896996376da8ff669833e924d39
|
7
|
+
data.tar.gz: 8c3eee0868acdbf881cf61727a82388f17509bb91c57de6926d0fe409ca4734e3f51aa309a976438a15c9bd095e4205b73e4a5dd2d7863ac6587679f1d7e1875
|
data/Rakefile
CHANGED
@@ -23,7 +23,7 @@ end
|
|
23
23
|
desc 'Start an IRB session with all necessary files required.'
|
24
24
|
task :shell do |t|
|
25
25
|
chdir File.dirname(__FILE__)
|
26
|
-
exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
|
26
|
+
exec 'irb -I lib/ -I lib/simhash -I lib/simhash/string -I lib/simhash/integer -r rubygems'
|
27
27
|
end
|
28
28
|
|
29
29
|
desc 'Build the gemspec.'
|
File without changes
|
@@ -2,7 +2,7 @@ class String
|
|
2
2
|
def simhash(options={})
|
3
3
|
split_by = options.delete(:split_by) || " "
|
4
4
|
|
5
|
-
# Do the punctuation clean before the split..
|
5
|
+
# Do the punctuation clean before the split..
|
6
6
|
# You could argue this is not preserving the meaning doing so actually preserves the edge case where a hyphen is removed
|
7
7
|
# resulting hash does not match the same string with a space in there before the split
|
8
8
|
if options[:preserve_punctuation]
|
@@ -11,7 +11,7 @@ class String
|
|
11
11
|
Simhash.hash(self.gsub(Simhash::PUNCTUATION_REGEXP, ' ') .split(split_by), options)
|
12
12
|
end
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
def hash_vl_rb(length)
|
16
16
|
return 0 if self == ""
|
17
17
|
|
@@ -25,4 +25,4 @@ class String
|
|
25
25
|
x
|
26
26
|
end
|
27
27
|
|
28
|
-
end
|
28
|
+
end
|
data/lib/simhash.rb
CHANGED
@@ -3,23 +3,23 @@
|
|
3
3
|
require 'active_support/core_ext/string/multibyte'
|
4
4
|
require 'unicode'
|
5
5
|
|
6
|
-
require 'string'
|
7
|
-
require 'integer'
|
6
|
+
require 'simhash/string'
|
7
|
+
require 'simhash/integer'
|
8
8
|
require 'simhash/stopwords'
|
9
9
|
|
10
|
-
begin
|
10
|
+
#begin
|
11
11
|
require 'string_hashing'
|
12
|
-
rescue LoadError
|
13
|
-
end
|
12
|
+
#rescue LoadError
|
13
|
+
#end
|
14
14
|
|
15
|
-
module Simhash
|
15
|
+
module Simhash
|
16
16
|
DEFAULT_STRING_HASH_METHOD = String.public_instance_methods.include?("hash_vl") ? :hash_vl : :hash_vl_rb
|
17
17
|
PUNCTUATION_REGEXP = if RUBY_VERSION >= "1.9"
|
18
18
|
/(\s|\d|[^\p{L}]|\302\240| *— *|[«»…\-–—]| )+/u
|
19
19
|
else
|
20
20
|
/(\s|\d|\W|\302\240| *— *|[«»…\-–—]| )+/u
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
# Compare calculates the Hamming distance between two 64-bit integers
|
24
24
|
#
|
25
25
|
# Currently, this is calculated using the Kernighan method [1]. Other methods
|
@@ -40,25 +40,25 @@ module Simhash
|
|
40
40
|
def self.hash(tokens, options={})
|
41
41
|
hashbits = options[:hashbits] || 64
|
42
42
|
hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
|
43
|
-
|
43
|
+
|
44
44
|
v = [0] * hashbits
|
45
45
|
masks = v.dup
|
46
46
|
masks.each_with_index {|e, i| masks[i] = (1 << i)}
|
47
|
-
|
47
|
+
|
48
48
|
self.each_filtered_token(tokens, options) do |token|
|
49
49
|
hashed_token = token.send(hashing_method, hashbits).to_i
|
50
50
|
hashbits.times do |i|
|
51
51
|
v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
|
52
52
|
end
|
53
53
|
end
|
54
|
-
|
54
|
+
|
55
55
|
fingerprint = 0
|
56
56
|
|
57
|
-
hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
|
58
|
-
|
59
|
-
fingerprint
|
57
|
+
hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
|
58
|
+
|
59
|
+
fingerprint
|
60
60
|
end
|
61
|
-
|
61
|
+
|
62
62
|
def self.each_filtered_token(tokens, options={})
|
63
63
|
token_min_size = options[:token_min_size].to_i
|
64
64
|
stop_sentenses = options[:stop_sentenses]
|
@@ -66,28 +66,28 @@ module Simhash
|
|
66
66
|
# cutting punctuation (\302\240 is unbreakable space)
|
67
67
|
# Moved up to string class
|
68
68
|
# token = token.gsub(PUNCTUATION_REGEXP, ' ') if !options[:preserve_punctuation]
|
69
|
-
|
69
|
+
|
70
70
|
token = Unicode::downcase(token.strip)
|
71
|
-
|
71
|
+
|
72
72
|
# cutting stop-words
|
73
73
|
token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
|
74
|
-
|
74
|
+
|
75
75
|
# cutting stop-sentenses
|
76
76
|
next if stop_sentenses && stop_sentenses.include?(" #{token} ")
|
77
|
-
|
77
|
+
|
78
78
|
next if token.size.zero? || token.mb_chars.size < token_min_size
|
79
|
-
|
80
|
-
yield token
|
79
|
+
|
80
|
+
yield token
|
81
81
|
end
|
82
82
|
end
|
83
|
-
|
83
|
+
|
84
84
|
def self.filtered_tokens(tokens, options={})
|
85
85
|
filtered_tokens = []
|
86
86
|
self.each_filtered_token(tokens, options) { |token| filtered_tokens << token }
|
87
|
-
filtered_tokens
|
87
|
+
filtered_tokens
|
88
88
|
end
|
89
|
-
|
89
|
+
|
90
90
|
def self.hm
|
91
91
|
@@string_hash_method
|
92
92
|
end
|
93
|
-
end
|
93
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mort666_simhash
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Gusev
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-08-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -109,12 +109,12 @@ files:
|
|
109
109
|
- Rakefile
|
110
110
|
- ext/string_hashing/extconf.rb
|
111
111
|
- ext/string_hashing/string_hashing.c
|
112
|
-
- lib/integer.rb
|
113
112
|
- lib/simhash.rb
|
113
|
+
- lib/simhash/integer.rb
|
114
114
|
- lib/simhash/stopwords.rb
|
115
115
|
- lib/simhash/stopwords/en.rb
|
116
116
|
- lib/simhash/stopwords/ru.rb
|
117
|
-
- lib/string.rb
|
117
|
+
- lib/simhash/string.rb
|
118
118
|
homepage: http://github.com/mort666/simhash
|
119
119
|
licenses: []
|
120
120
|
metadata: {}
|
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
134
|
version: '0'
|
135
135
|
requirements: []
|
136
136
|
rubyforge_project:
|
137
|
-
rubygems_version: 2.
|
137
|
+
rubygems_version: 2.6.12
|
138
138
|
signing_key:
|
139
139
|
specification_version: 4
|
140
140
|
summary: 'Gives you possbility to convert string into simhashes to futher use: finding
|