simhash 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,28 @@
1
+
2
+ LICENSE
3
+
4
+ The MIT License
5
+
6
+ Copyright (c) 2002 Charikar, Simhash algorithm
7
+ Copyright (c) 2009 Andre Hagenbruch, Python implementation
8
+ Copyright (c) 2010 Bookmate.ru
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in
18
+ all copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26
+ THE SOFTWARE.
27
+
28
+
data/README ADDED
@@ -0,0 +1 @@
1
+ Implementation of Charikar simhashes in Ruby
data/Rakefile ADDED
@@ -0,0 +1,46 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+
5
+ $LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
6
+ require 'simhash'
7
+
8
+ desc 'Default: run unit tests.'
9
+ task :default => [:test]
10
+
11
+ desc 'Test the simhash gem'
12
+ Rake::TestTask.new(:test) do |t|
13
+ t.libs << 'lib'
14
+ t.pattern = 'test/**/*_test.rb'
15
+ t.verbose = true
16
+ end
17
+
18
+ desc 'Start an IRB session with all necessary files required.'
19
+ task :shell do |t|
20
+ chdir File.dirname(__FILE__)
21
+ exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems -r init'
22
+ end
23
+
24
+ desc 'Build the gemspec.'
25
+ task :gemspec do |t|
26
+ exec 'gem build simhash.gemspec'
27
+ end
28
+
29
+ desc "Print a list of the files to be put into the gem"
30
+ task :manifest do
31
+ spec.files.each do |file|
32
+ puts file
33
+ end
34
+ end
35
+
36
+ desc "Generate a gemspec file for GitHub"
37
+ task :gemspec do
38
+ File.open("#{spec.name}.gemspec", 'w') do |f|
39
+ f.write spec.to_ruby
40
+ end
41
+ end
42
+
43
+ desc "Build the gem into the current directory"
44
+ task :gem => :gemspec do
45
+ `gem build #{spec.name}.gemspec`
46
+ end
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), "lib", "simhash")
2
+ require File.join(File.dirname(__FILE__), "lib", "string")
3
+ require File.join(File.dirname(__FILE__), "lib", "integer")
data/lib/integer.rb ADDED
@@ -0,0 +1,16 @@
1
+ class Integer
2
+ # Hamming distance – number of different bits in same positions
3
+ # H(1001, 1110) = 3
4
+ # H(1001, 1000) = 1
5
+ def hamming_distance_to(integer)
6
+ total = 0
7
+ difference = self ^ integer
8
+
9
+ while difference > 0 do
10
+ total += 1 if (difference & 1).nonzero?
11
+ difference >>= 1
12
+ end
13
+
14
+ total
15
+ end
16
+ end
data/lib/simhash.rb ADDED
@@ -0,0 +1,56 @@
1
+ class Simhash
2
+ attr_accessor :hashbits, :hash
3
+ def initialize(tokens='', hashbits=128)
4
+ self.hashbits = hashbits
5
+ self.hash = self.simhash(tokens)
6
+ hash
7
+ end
8
+
9
+ def to_s
10
+ self.hash.to_s
11
+ end
12
+
13
+ def to_i
14
+ self.hash.to_i
15
+ end
16
+
17
+ def simhash(tokens)
18
+ v = [0] * self.hashbits
19
+ masks = v.dup
20
+ masks.each_with_index {|e, i| masks[i] = (1 << i)}
21
+ tokens.each do |token|
22
+ hashed_token = token.hash_wl(self.hashbits)
23
+ bitmask = 0
24
+ self.hashbits.times do |i|
25
+ v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
26
+ end
27
+ end
28
+
29
+ fingerprint = 0
30
+
31
+ self.hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
32
+
33
+ fingerprint
34
+ end
35
+
36
+ def hamming_distance(other_hash)
37
+ self.hash.hamming_distance_to(other_hash.hash)
38
+ end
39
+
40
+ def string_hash(string)
41
+ if string == ""
42
+ return 0
43
+ else
44
+ x = string[0] << 7
45
+ m = 1000003
46
+ mask = (1<<self.hashbits) - 1
47
+ string.each_byte do |char|
48
+ x = ((x * m) ^ char) & mask
49
+ end
50
+
51
+ x ^= string.size
52
+ x = -2 if x == -1
53
+ x
54
+ end
55
+ end
56
+ end
data/lib/string.rb ADDED
@@ -0,0 +1,19 @@
1
+ class String
2
+ def simhash(hashbits=128)
3
+ Simhash.new(self.split, hashbits)
4
+ end
5
+
6
+ # string hash of predefined length
7
+ def hash_wl(length)
8
+ return 0 if self == ""
9
+
10
+ x = self[0] << 7
11
+ m = 1000003
12
+ mask = (1<<length) - 1
13
+ self.each_byte{ |char| x = ((x * m) ^ char) & mask }
14
+
15
+ x ^= self.size
16
+ x = -2 if x == -1
17
+ x
18
+ end
19
+ end
data/rails/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), "..", "lib", "simhash")
2
+ require File.join(File.dirname(__FILE__), "..", "lib", "string")
3
+ require File.join(File.dirname(__FILE__), "..", "lib", "integer")
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simhash
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Alex Gusev
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-11 00:00:00 +04:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Implementation of Charikar simhashes in Ruby
23
+ email: alex.gusev@bookmate.ru
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - README
32
+ - LICENSE
33
+ - Rakefile
34
+ - init.rb
35
+ - lib/integer.rb
36
+ - lib/simhash.rb
37
+ - lib/string.rb
38
+ - rails/init.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/bookmate/simhash
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project: simhash
69
+ rubygems_version: 1.3.7
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: "Gives you the possibility to convert strings into simhashes for further use: finding near-duplicates, similar strings, etc."
73
+ test_files: []
74
+