simhash 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +53 -0
- data/Rakefile +4 -1
- data/ext/string_hashing/extconf.rb +48 -0
- data/ext/string_hashing/string_hashing.c +63 -0
- data/lib/simhash.rb +18 -6
- data/lib/string.rb +3 -4
- metadata +26 -11
- data/README +0 -1
- data/init.rb +0 -3
- data/rails/init.rb +0 -3
data/README.rdoc
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
==Abstract
|
2
|
+
|
3
|
+
This is an implementation of {Moses Charikar's simhashes}[http://portal.acm.org/citation.cfm?id=509965] in Ruby.
|
4
|
+
|
5
|
+
==Usage
|
6
|
+
|
7
|
+
When you have a string and want to calculate its simhash, call:
|
8
|
+
|
9
|
+
my_string.simhash
|
10
|
+
|
11
|
+
By default this generates a 64-bit integer, which is the simhash of the string.
|
12
|
+
|
13
|
+
It's always better to tokenize string before simhashing. It's as simple as
|
14
|
+
|
15
|
+
my_string.simhash(:split_by => / /)
|
16
|
+
|
17
|
+
This will also generate a 64-bit integer, but the string is split into words first.
|
18
|
+
It's handy when you need to calculate similarity of strings based on word usage.
|
19
|
+
You can split string as you like: by letters/sentences/specific letter-combinations, etc.
|
20
|
+
|
21
|
+
my_string.simhash(:split_by => /./, :bitlength => 512)
|
22
|
+
|
23
|
+
Sometimes you might need longer simhash (finding similarity for very long strings is a good example).
|
24
|
+
You can set length of result hash by passing bitlength parameter. This example will return 512-bit simhash
|
25
|
+
for your string split by sentences.
|
26
|
+
|
27
|
+
==Advanced usage
|
28
|
+
|
29
|
+
It's useful to clean your string before simhashing. But it's useful not to clean, too.
|
30
|
+
|
31
|
+
Here are examples:
|
32
|
+
|
33
|
+
my_string.simhash(:stop_words => true) # here we clean
|
34
|
+
|
35
|
+
This will find stop-words in your string and remove them before simhashing. Stop-words are "the", "not", "about", etc.
|
36
|
+
Currently we remove only Russian and English stop-words.
|
37
|
+
|
38
|
+
my_string.simhash(:preserve_punctuation => true) # here we not
|
39
|
+
|
40
|
+
This will not remove punctuation before simhashing. By default, all dots, commas, etc. are removed after the string has been split into words,
|
41
|
+
because different punctuation generally does not mean different content. If you do not agree, you can turn this default off.
|
42
|
+
|
43
|
+
==Installation
|
44
|
+
|
45
|
+
As usual:
|
46
|
+
|
47
|
+
gem install simhash
|
48
|
+
|
49
|
+
But if you have {GNU MP library}[http://gmplib.org/], simhash will work faster! To check out which version is used, type:
|
50
|
+
|
51
|
+
Simhash::DEFAULT_STRING_HASH_METHOD
|
52
|
+
|
53
|
+
It should return a symbol. If the symbol ends with "rb", the slower pure-Ruby implementation is in use. If you want to make it faster, install GNU MP.
|
data/Rakefile
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'rake'
|
2
2
|
require 'rake/testtask'
|
3
3
|
require 'rake/rdoctask'
|
4
|
+
require 'rake/extensiontask'
|
4
5
|
|
5
6
|
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
|
6
7
|
require 'simhash'
|
@@ -8,6 +9,8 @@ require 'simhash'
|
|
8
9
|
desc 'Default: run unit tests.'
|
9
10
|
task :default => [:test]
|
10
11
|
|
12
|
+
Rake::ExtensionTask.new('string_hashing')
|
13
|
+
|
11
14
|
desc 'Test the simhash gem'
|
12
15
|
Rake::TestTask.new(:test) do |t|
|
13
16
|
t.libs << 'lib'
|
@@ -18,7 +21,7 @@ end
|
|
18
21
|
desc 'Start an IRB session with all necessary files required.'
|
19
22
|
task :shell do |t|
|
20
23
|
chdir File.dirname(__FILE__)
|
21
|
-
exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems
|
24
|
+
exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
|
22
25
|
end
|
23
26
|
|
24
27
|
desc 'Build the gemspec.'
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Build configuration for the optional 'string_hashing' C extension.
#
# The extension needs the GNU MP library. When gmp.h or libgmp cannot be
# found, a stub Makefile is written instead so that the build step succeeds
# without compiling anything.
require 'mkmf'

extension_name = 'string_hashing'

# Sort out the universal vs. single-architecture build problems on MacOS X
if RUBY_PLATFORM.include?('darwin')
  puts "MacOS X build: fixing architecture flags:"

  commonflags = ENV['ARCHFLAGS']
  if commonflags
    puts " using the value in ARCHFLAGS environment variable (%p)." % [commonflags]

    # Replace every "-arch xxx" flag with the flags requested through
    # ARCHFLAGS, for the compiler, the linker and the shared-library command.
    arch_flag = /-arch\s+\S+ /
    [$CFLAGS, $LDFLAGS, CONFIG['LDSHARED']].each do |flags|
      flags.gsub!(arch_flag, '')
      flags << ' ' << commonflags
    end
  else
    $stderr.puts %{
	=========== WARNING ===========

	You are building this extension on OS X without setting the
	ARCHFLAGS environment variable.

	If you are seeing this message, that means that the
	build will probably fail.

	===================================
	}.gsub( /^\t+/, ' ' )
  end
end

# Register the --with-string_hashing-* options first so that any include/lib
# directories they supply are already on the search path for the probes below.
dir_config(extension_name)

# Probe for both the header and the library. have_library adds -lgmp to the
# link libraries only when linking against it actually works, so a missing or
# unusable libgmp leads to the stub Makefile instead of a failed build.
if find_header("gmp.h") && have_library('gmp')
  $stderr.puts "Configuring extensions"
  create_makefile(extension_name)
else
  $stderr.puts "Skipping building of C extension"
  # Stub Makefile with no-op targets, so `make`, `make install` and
  # `make clean` all succeed without building anything.
  stub_makefile = File.join(File.dirname(__FILE__), "Makefile")
  File.open(stub_makefile, "w") { |f| f.write("all: \ninstall: \nclean: \n") }
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <gmp.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
|
5
|
+
VALUE StringHashing = Qnil;
|
6
|
+
|
7
|
+
void Init_string_hashing();
|
8
|
+
|
9
|
+
VALUE method_hash_vl(VALUE self, VALUE bitlength);
|
10
|
+
|
11
|
+
void Init_string_hashing() {
|
12
|
+
rb_define_method(rb_cString, "hash_vl", method_hash_vl, 1);
|
13
|
+
}
|
14
|
+
|
15
|
+
/*
 * String#hash_vl(bitlength) -> String
 *
 * Hashes the receiver's bytes with GMP arithmetic and returns the result as
 * a base-10 Ruby String.
 *
 * Algorithm: x starts as (first byte << 7); for every byte,
 *   x = ((x * 1000003) ^ byte) & (2**bitlength - 1)
 * and finally the byte length of the string is XORed into x.
 *
 * An empty receiver yields "0". Raises ArgumentError when bitlength is
 * negative and NoMemoryError when the output buffer cannot be allocated.
 */
VALUE method_hash_vl(VALUE self, VALUE bitlength) {
  int bl = NUM2INT(bitlength);
  long len = RSTRING_LEN(self);
  /* unsigned view of the bytes so values above 127 are not sign-extended */
  const unsigned char *bytes = (const unsigned char *) RSTRING_PTR(self);
  const unsigned long m = 1000003UL;
  char *result;
  long i;
  VALUE res;
  mpz_t x, mask, long_len, computations, byte;

  /* a negative value would wrap to a huge exponent in mpz_ui_pow_ui */
  if (bl < 0) {
    rb_raise(rb_eArgError, "bitlength must be non-negative");
  }

  /* nothing has been allocated yet, so returning here leaks nothing;
     return a String like every other path does */
  if (len == 0) {
    return rb_str_new2("0");
  }

  mpz_init_set_ui(long_len, (unsigned long) len);
  mpz_init_set_ui(x, (unsigned long) bytes[0] << 7);

  // generating mask of length bitlength filled with 1
  mpz_init(mask);
  mpz_ui_pow_ui(mask, 2, (unsigned long) bl);
  mpz_sub_ui(mask, mask, 1);

  mpz_init(computations);
  mpz_init2(byte, 8);

  for (i = 0; i < len; i++) {
    mpz_set_ui(byte, bytes[i]);
    mpz_mul_ui(computations, x, m);
    mpz_xor(computations, computations, byte);
    mpz_and(x, mask, computations);
  }

  mpz_xor(x, x, long_len);

  /* Size the buffer from the actual value: the final XOR with the length can
     make x wider than bitlength, and mpz_get_str needs room for all digits,
     a possible sign and the terminating NUL. */
  result = malloc(mpz_sizeinbase(x, 10) + 2);
  if (result != NULL) {
    mpz_get_str(result, 10, x);
  }

  mpz_clear(x);
  mpz_clear(byte);
  mpz_clear(computations);
  mpz_clear(mask);
  mpz_clear(long_len);

  /* raise only after the GMP values are released */
  if (result == NULL) {
    rb_raise(rb_eNoMemError, "failed to allocate hash_vl result buffer");
  }

  res = rb_str_new2(result);
  free(result);

  return res;
}
|
data/lib/simhash.rb
CHANGED
@@ -1,11 +1,21 @@
|
|
1
1
|
$KCODE = 'u'
|
2
|
-
require '
|
3
|
-
require
|
2
|
+
require 'unicode'
|
3
|
+
require 'string'
|
4
|
+
require 'integer'
|
5
|
+
require 'simhash/stopwords'
|
6
|
+
begin
|
7
|
+
require 'string_hashing'
|
8
|
+
rescue LoadError
|
9
|
+
end
|
10
|
+
|
4
11
|
|
5
12
|
module Simhash
|
13
|
+
# Name of the String method used to hash tokens by default: :hash_vl when
# String responds to it, otherwise :hash_vl_rb.
# method_defined? is used instead of public_instance_methods.include?("hash_vl")
# because that list holds Symbols on Ruby 1.9+, so a String lookup never matches.
DEFAULT_STRING_HASH_METHOD = String.method_defined?(:hash_vl) ? :hash_vl : :hash_vl_rb
|
14
|
+
|
6
15
|
def self.hash(tokens, options={})
|
7
16
|
hashbits = options[:hashbits] || 64
|
8
17
|
token_min_size = options[:token_min_size].to_i
|
18
|
+
hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
|
9
19
|
|
10
20
|
v = [0] * hashbits
|
11
21
|
masks = v.dup
|
@@ -15,15 +25,13 @@ module Simhash
|
|
15
25
|
# cutting punctuation (\302\240 is unbreakable space)
|
16
26
|
token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
|
17
27
|
|
18
|
-
token = token.strip
|
28
|
+
token = Unicode::downcase(token.strip)
|
19
29
|
|
20
30
|
# cutting stop-words
|
21
31
|
token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
|
22
32
|
|
23
33
|
next if token.size.zero? || token.size < token_min_size
|
24
|
-
|
25
|
-
hashed_token = token.hash_wl(hashbits)
|
26
|
-
bitmask = 0
|
34
|
+
hashed_token = token.send(hashing_method, hashbits).to_i
|
27
35
|
hashbits.times do |i|
|
28
36
|
v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
|
29
37
|
end
|
@@ -35,4 +43,8 @@ module Simhash
|
|
35
43
|
|
36
44
|
fingerprint
|
37
45
|
end
|
46
|
+
|
47
|
+
# Returns the name of the string hashing method.
#
# NOTE(review): @@string_hash_method is not assigned anywhere in the visible
# part of this module, so reading it unconditionally raises NameError. It is
# still returned when defined; otherwise DEFAULT_STRING_HASH_METHOD is used.
def self.hm
  defined?(@@string_hash_method) ? @@string_hash_method : DEFAULT_STRING_HASH_METHOD
end
|
38
50
|
end
|
data/lib/string.rb
CHANGED
@@ -3,19 +3,18 @@ class String
|
|
3
3
|
split_by = options.delete(:split_by) || " "
|
4
4
|
Simhash.hash(self.split(split_by), options)
|
5
5
|
end
|
6
|
-
|
7
6
|
|
8
|
-
|
9
|
-
def hash_wl(length)
|
7
|
+
# Pure-Ruby variable-length hash of the string's bytes.
#
# Starts from (first byte << 7), folds every byte in with
#   x = ((x * 1000003) ^ byte) & ((1 << length) - 1)
# and finally XORs in the byte length.
#
# length - number of bits kept by the mask on each step.
#
# Returns an Integer; 0 for an empty string.
def hash_vl_rb(length)
  return 0 if empty?

  multiplier = 1000003
  mask = (1 << length) - 1

  # getbyte always yields an Integer. self[0] is a one-character String on
  # Ruby 1.9+, which turned the shift and multiplication below into string
  # operations.
  x = getbyte(0) << 7
  each_byte { |byte| x = ((x * multiplier) ^ byte) & mask }

  # Use the byte count, not the character count, so multi-byte strings hash
  # the same way as in the byte-oriented loop above.
  x ^= bytesize
  x = -2 if x == -1
  x
end
|
19
|
+
|
21
20
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhash
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Alex Gusev
|
@@ -15,30 +15,45 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-20 00:00:00 +04:00
|
19
19
|
default_executable:
|
20
|
-
dependencies:
|
21
|
-
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: unicode
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 17
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 3
|
33
|
+
- 1
|
34
|
+
version: 0.3.1
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
22
37
|
description: Implementation of Charikar simhashes in Ruby
|
23
38
|
email: alex.gusev@bookmate.ru
|
24
39
|
executables: []
|
25
40
|
|
26
|
-
extensions:
|
27
|
-
|
41
|
+
extensions:
|
42
|
+
- ext/string_hashing/extconf.rb
|
28
43
|
extra_rdoc_files: []
|
29
44
|
|
30
45
|
files:
|
31
|
-
- README
|
46
|
+
- README.rdoc
|
32
47
|
- LICENSE
|
33
48
|
- Rakefile
|
34
|
-
- init.rb
|
35
49
|
- lib/integer.rb
|
36
50
|
- lib/simhash/stopwords/en.rb
|
37
51
|
- lib/simhash/stopwords/ru.rb
|
38
52
|
- lib/simhash/stopwords.rb
|
39
53
|
- lib/simhash.rb
|
40
54
|
- lib/string.rb
|
41
|
-
-
|
55
|
+
- ext/string_hashing/extconf.rb
|
56
|
+
- ext/string_hashing/string_hashing.c
|
42
57
|
has_rdoc: true
|
43
58
|
homepage: http://github.com/bookmate/simhash
|
44
59
|
licenses: []
|
data/README
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Implementation of Charikar simhashes in Ruby
|
data/init.rb
DELETED
data/rails/init.rb
DELETED