mort666_simhash 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3b3b62e04b6e50547de7e7edf3441da79ce41869
4
- data.tar.gz: dde5d235ee1d2bbeaa9304fe235768883c6525cf
3
+ metadata.gz: e7b5060a56cd39db1a85ff2bca91022d8279742f
4
+ data.tar.gz: 1501e136b7d5c32e11580f42d13ce5cae8a38682
5
5
  SHA512:
6
- metadata.gz: c333facb22f608bf1caec2de3023f30ccc552f116a323063819dfcfe2bc49d1bc397a3f930d3ac8caa4e19d1f6308542096b4732e5ae308a9c1dc241083674ee
7
- data.tar.gz: 5c1ef0e033e82009858a7136e23c01936ce7083c551783c8987f36a31a294e0a1babdc82e2bd3a943c60a444bb724fe09affc2b1998495b2172dd8d58b1e06cf
6
+ metadata.gz: 9f851149712bb5384d1657b3c87cfa90fc81ae0775118a2651ccd710016f980d1377685f55d8a9bc5210204bbbf860e206c80896996376da8ff669833e924d39
7
+ data.tar.gz: 8c3eee0868acdbf881cf61727a82388f17509bb91c57de6926d0fe409ca4734e3f51aa309a976438a15c9bd095e4205b73e4a5dd2d7863ac6587679f1d7e1875
data/Rakefile CHANGED
@@ -23,7 +23,7 @@ end
23
23
  desc 'Start an IRB session with all necessary files required.'
24
24
  task :shell do |t|
25
25
  chdir File.dirname(__FILE__)
26
- exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
26
+ exec 'irb -I lib/ -I lib/simhash -I lib/simhash/string -I lib/simhash/integer -r rubygems'
27
27
  end
28
28
 
29
29
  desc 'Build the gemspec.'
File without changes
@@ -2,7 +2,7 @@ class String
2
2
  def simhash(options={})
3
3
  split_by = options.delete(:split_by) || " "
4
4
 
5
- # Do the punctuation clean before the split..
5
+ # Do the punctuation clean before the split..
6
6
  # You could argue this is not preserving the meaning doing so actually preserves the edge case where a hyphen is removed
7
7
  # resulting hash does not match the same string with a space in there before the split
8
8
  if options[:preserve_punctuation]
@@ -11,7 +11,7 @@ class String
11
11
  Simhash.hash(self.gsub(Simhash::PUNCTUATION_REGEXP, ' ') .split(split_by), options)
12
12
  end
13
13
  end
14
-
14
+
15
15
  def hash_vl_rb(length)
16
16
  return 0 if self == ""
17
17
 
@@ -25,4 +25,4 @@ class String
25
25
  x
26
26
  end
27
27
 
28
- end
28
+ end
data/lib/simhash.rb CHANGED
@@ -3,23 +3,23 @@
3
3
  require 'active_support/core_ext/string/multibyte'
4
4
  require 'unicode'
5
5
 
6
- require 'string'
7
- require 'integer'
6
+ require 'simhash/string'
7
+ require 'simhash/integer'
8
8
  require 'simhash/stopwords'
9
9
 
10
- begin
10
+ #begin
11
11
  require 'string_hashing'
12
- rescue LoadError
13
- end
12
+ #rescue LoadError
13
+ #end
14
14
 
15
- module Simhash
15
+ module Simhash
16
16
  DEFAULT_STRING_HASH_METHOD = String.public_instance_methods.include?("hash_vl") ? :hash_vl : :hash_vl_rb
17
17
  PUNCTUATION_REGEXP = if RUBY_VERSION >= "1.9"
18
18
  /(\s|\d|[^\p{L}]|\302\240| *— *|[«»…\-–—]| )+/u
19
19
  else
20
20
  /(\s|\d|\W|\302\240| *— *|[«»…\-–—]| )+/u
21
21
  end
22
-
22
+
23
23
  # Compare calculates the Hamming distance between two 64-bit integers
24
24
  #
25
25
  # Currently, this is calculated using the Kernighan method [1]. Other methods
@@ -40,25 +40,25 @@ module Simhash
40
40
  def self.hash(tokens, options={})
41
41
  hashbits = options[:hashbits] || 64
42
42
  hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
43
-
43
+
44
44
  v = [0] * hashbits
45
45
  masks = v.dup
46
46
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
47
-
47
+
48
48
  self.each_filtered_token(tokens, options) do |token|
49
49
  hashed_token = token.send(hashing_method, hashbits).to_i
50
50
  hashbits.times do |i|
51
51
  v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
52
52
  end
53
53
  end
54
-
54
+
55
55
  fingerprint = 0
56
56
 
57
- hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
58
-
59
- fingerprint
57
+ hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
58
+
59
+ fingerprint
60
60
  end
61
-
61
+
62
62
  def self.each_filtered_token(tokens, options={})
63
63
  token_min_size = options[:token_min_size].to_i
64
64
  stop_sentenses = options[:stop_sentenses]
@@ -66,28 +66,28 @@ module Simhash
66
66
  # cutting punctuation (\302\240 is unbreakable space)
67
67
  # Moved up to string class
68
68
  # token = token.gsub(PUNCTUATION_REGEXP, ' ') if !options[:preserve_punctuation]
69
-
69
+
70
70
  token = Unicode::downcase(token.strip)
71
-
71
+
72
72
  # cutting stop-words
73
73
  token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
74
-
74
+
75
75
  # cutting stop-sentenses
76
76
  next if stop_sentenses && stop_sentenses.include?(" #{token} ")
77
-
77
+
78
78
  next if token.size.zero? || token.mb_chars.size < token_min_size
79
-
80
- yield token
79
+
80
+ yield token
81
81
  end
82
82
  end
83
-
83
+
84
84
  def self.filtered_tokens(tokens, options={})
85
85
  filtered_tokens = []
86
86
  self.each_filtered_token(tokens, options) { |token| filtered_tokens << token }
87
- filtered_tokens
87
+ filtered_tokens
88
88
  end
89
-
89
+
90
90
  def self.hm
91
91
  @@string_hash_method
92
92
  end
93
- end
93
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mort666_simhash
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Gusev
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-09-26 00:00:00.000000000 Z
12
+ date: 2017-08-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -109,12 +109,12 @@ files:
109
109
  - Rakefile
110
110
  - ext/string_hashing/extconf.rb
111
111
  - ext/string_hashing/string_hashing.c
112
- - lib/integer.rb
113
112
  - lib/simhash.rb
113
+ - lib/simhash/integer.rb
114
114
  - lib/simhash/stopwords.rb
115
115
  - lib/simhash/stopwords/en.rb
116
116
  - lib/simhash/stopwords/ru.rb
117
- - lib/string.rb
117
+ - lib/simhash/string.rb
118
118
  homepage: http://github.com/mort666/simhash
119
119
  licenses: []
120
120
  metadata: {}
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
134
  version: '0'
135
135
  requirements: []
136
136
  rubyforge_project:
137
- rubygems_version: 2.4.3
137
+ rubygems_version: 2.6.12
138
138
  signing_key:
139
139
  specification_version: 4
140
140
  summary: 'Gives you possbility to convert string into simhashes to futher use: finding