simhash 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,28 @@
1
+
2
+ LICENSE
3
+
4
+ The MIT License
5
+
6
+ Copyright (c) 2002 Charikar, Simhash algorithm
7
+ Copyright (c) 2009 Andre Hagenbruch, Python implementation
8
+ Copyright (c) 2010 Bookmate.ru
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in
18
+ all copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26
+ THE SOFTWARE.
27
+
28
+
data/README ADDED
@@ -0,0 +1 @@
1
+ Implementation of Charikar simhashes in Ruby
data/Rakefile ADDED
@@ -0,0 +1,46 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+
# Put lib/ on the load path so the gem's own code can be required below.
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
require 'simhash'

# Gemspec file the packaging tasks operate on.
GEMSPEC_FILE = 'simhash.gemspec'

# Returns the Gem::Specification loaded from GEMSPEC_FILE (memoized).
# Gem::Specification.load returns nil when the file is missing or cannot be
# evaluated, so abort with a readable message instead of failing later with a
# NoMethodError on nil.
def spec
  @spec ||= Gem::Specification.load(File.join(File.dirname(__FILE__), GEMSPEC_FILE)) ||
            abort("Could not load #{GEMSPEC_FILE}")
end

desc 'Default: run unit tests.'
task :default => [:test]

desc 'Test the simhash gem'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

desc 'Start an IRB session with all necessary files required.'
task :shell do
  chdir File.dirname(__FILE__)
  # "-I ." is required for "-r init" to find ./init.rb: the current directory
  # is not on the load path on Ruby >= 1.9.2.
  exec 'irb -I . -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems -r init'
end

# This task used to be defined twice. The first definition exec'ed
# `gem build`, which replaced the rake process, so the second definition
# (writing spec.to_ruby to disk) could never run. Only the action that actually
# executed is kept, and it now goes through `sh` so that rake stays alive and
# reports a failing build.
desc 'Build the gemspec.'
task :gemspec do
  sh "gem build #{GEMSPEC_FILE}"
end

desc "Print a list of the files to be put into the gem"
task :manifest do
  spec.files.each { |file| puts file }
end

# The :gemspec prerequisite already runs `gem build`, so no further action is
# needed here.
desc "Build the gem into the current directory"
task :gem => :gemspec
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), "lib", "simhash")
2
+ require File.join(File.dirname(__FILE__), "lib", "string")
3
+ require File.join(File.dirname(__FILE__), "lib", "integer")
data/lib/integer.rb ADDED
@@ -0,0 +1,16 @@
class Integer
  # Hamming distance: the number of bit positions in which +self+ and
  # +integer+ differ.
  #
  #   0b1001.hamming_distance_to(0b1110) # => 3
  #   0b1001.hamming_distance_to(0b1000) # => 1
  #
  # Raises ArgumentError when the operands differ in sign. In that case the
  # XOR is negative (conceptually an infinite run of 1 bits), so no finite
  # distance exists; the previous bit-shifting loop silently returned 0 there.
  def hamming_distance_to(integer)
    difference = self ^ integer
    if difference < 0
      raise ArgumentError, 'Hamming distance is undefined for integers of different sign'
    end

    # Population count of the XOR: every 1 bit marks a differing position.
    difference.to_s(2).count('1')
  end
end
data/lib/simhash.rb ADDED
@@ -0,0 +1,56 @@
# Charikar-style similarity hash of a collection of tokens.
class Simhash
  # hashbits:: width of the fingerprint in bits.
  # hash::     the fingerprint itself, a non-negative Integer.
  #            NOTE: this accessor replaces Object#hash for Simhash instances.
  attr_accessor :hashbits, :hash

  # tokens::   an enumerable of tokens (normally Strings). A String is
  #            iterated line by line, which is what String#each did on
  #            Ruby 1.8.
  # hashbits:: width of the fingerprint in bits.
  def initialize(tokens='', hashbits=128)
    self.hashbits = hashbits
    self.hash = simhash(tokens)
  end

  # Decimal string form of the fingerprint.
  def to_s
    hash.to_s
  end

  # The fingerprint as an Integer.
  def to_i
    hash.to_i
  end

  # Computes the fingerprint of +tokens+ and returns it as an Integer.
  #
  # Every token is hashed to +hashbits+ bits. Each bit position keeps a
  # counter that is incremented when the token hash has that bit set and
  # decremented otherwise. The fingerprint has a 1 in every position whose
  # counter ends up >= 0, so an empty token list yields all ones.
  def simhash(tokens)
    # String#each was removed in Ruby 1.9; each_line is what it used to do.
    tokens = tokens.each_line if tokens.is_a?(String)

    counters = Array.new(hashbits, 0)
    tokens.each do |token|
      # Strings are hashed with this class's own string_hash, so the class
      # works without the String#hash_wl core extension being loaded. Other
      # token types are still expected to provide hash_wl themselves.
      hashed_token = token.is_a?(String) ? string_hash(token) : token.hash_wl(hashbits)
      hashbits.times do |i|
        counters[i] += (hashed_token & (1 << i)).zero? ? -1 : 1
      end
    end

    fingerprint = 0
    hashbits.times { |i| fingerprint |= (1 << i) if counters[i] >= 0 }
    fingerprint
  end

  # Number of differing bits between this fingerprint and +other_hash+'s.
  # +other_hash+ must respond to #hash with an Integer fingerprint. Relies on
  # Integer#hamming_distance_to, which this class does not define itself.
  def hamming_distance(other_hash)
    hash.hamming_distance_to(other_hash.hash)
  end

  # Hashes +string+ to a non-negative Integer, keeping +hashbits+ bits after
  # every multiplication step. Returns 0 for the empty string.
  def string_hash(string)
    return 0 if string == ""

    multiplier = 1000003
    mask = (1 << hashbits) - 1
    # getbyte yields an Integer on every Ruby version that has it. string[0]
    # is a one-character String on Ruby >= 1.9, which made `<< 7` append a
    # character instead of shifting bits.
    x = string.getbyte(0) << 7
    string.each_byte { |byte| x = ((x * multiplier) ^ byte) & mask }

    # x is never negative here, so the old `x = -2 if x == -1` guard was
    # unreachable and has been dropped.
    x ^ string.size
  end
end
data/lib/string.rb ADDED
@@ -0,0 +1,19 @@
class String
  # Builds a Simhash of this string, using its whitespace-separated words as
  # tokens. +hashbits+ is the fingerprint width in bits. Requires the Simhash
  # class to be loaded.
  def simhash(hashbits=128)
    Simhash.new(split, hashbits)
  end

  # String hash of predefined length: hashes the bytes of this string to a
  # non-negative Integer, keeping +length+ bits after every multiplication
  # step. Returns 0 for the empty string.
  def hash_wl(length)
    return 0 if empty?

    multiplier = 1000003
    mask = (1 << length) - 1
    # getbyte yields an Integer on every Ruby version that has it. self[0] is
    # a one-character String on Ruby >= 1.9, which made `<< 7` append a
    # character instead of shifting bits.
    x = getbyte(0) << 7
    each_byte { |byte| x = ((x * multiplier) ^ byte) & mask }

    # x is never negative here, so the old `x = -2 if x == -1` guard was
    # unreachable and has been dropped.
    x ^ size
  end
end
data/rails/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), "..", "lib", "simhash")
2
+ require File.join(File.dirname(__FILE__), "..", "lib", "string")
3
+ require File.join(File.dirname(__FILE__), "..", "lib", "integer")
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simhash
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Alex Gusev
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-11 00:00:00 +04:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Implementation of Charikar simhashes in Ruby
23
+ email: alex.gusev@bookmate.ru
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - README
32
+ - LICENSE
33
+ - Rakefile
34
+ - init.rb
35
+ - lib/integer.rb
36
+ - lib/simhash.rb
37
+ - lib/string.rb
38
+ - rails/init.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/bookmate/simhash
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project: simhash
69
+ rubygems_version: 1.3.7
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: "Gives you possibility to convert string into simhashes to further use: finding near-duplicates, similar strings, etc."
73
+ test_files: []
74
+