simhash 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +53 -0
- data/Rakefile +4 -1
- data/ext/string_hashing/extconf.rb +48 -0
- data/ext/string_hashing/string_hashing.c +63 -0
- data/lib/simhash.rb +18 -6
- data/lib/string.rb +3 -4
- metadata +26 -11
- data/README +0 -1
- data/init.rb +0 -3
- data/rails/init.rb +0 -3
data/README.rdoc
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
==Abstract
|
2
|
+
|
3
|
+
This is an implementation of {Moses Charikar's simhashes}[http://portal.acm.org/citation.cfm?id=509965] in Ruby.
|
4
|
+
|
5
|
+
==Usage
|
6
|
+
|
7
|
+
When you have a string and want to calculate its simhash, you should call
|
8
|
+
|
9
|
+
my_string.simhash
|
10
|
+
|
11
|
+
By default it will generate a 64-bit integer - that is the simhash for this string.
|
12
|
+
|
13
|
+
It's always better to tokenize string before simhashing. It's as simple as
|
14
|
+
|
15
|
+
my_string.simhash(:split_by => / /)
|
16
|
+
|
17
|
+
This will also generate a 64-bit integer, but will split the string into words before hashing.
|
18
|
+
It's handy when you need to calculate similarity of strings based on word usage.
|
19
|
+
You can split string as you like: by letters/sentences/specific letter-combinations, etc.
|
20
|
+
|
21
|
+
my_string.simhash(:split_by => /./, :hashbits => 512)
|
22
|
+
|
23
|
+
Sometimes you might need longer simhash (finding similarity for very long strings is a good example).
|
24
|
+
You can set the length of the resulting hash by passing the hashbits parameter. This example will return a 512-bit simhash
|
25
|
+
for your string, split by the pattern given in :split_by.
|
26
|
+
|
27
|
+
==Advanced usage
|
28
|
+
|
29
|
+
It's useful to clean your string before simhashing. But it's useful not to clean, too.
|
30
|
+
|
31
|
+
Here are examples:
|
32
|
+
|
33
|
+
my_string.simhash(:stop_words => true) # here we clean
|
34
|
+
|
35
|
+
This will find stop-words in your string and remove them before simhashing. Stop-words are "the", "not", "about", etc.
|
36
|
+
Currently we remove only Russian and English stop-words.
|
37
|
+
|
38
|
+
my_string.simhash(:preserve_punctuation => true) # here we do not
|
39
|
+
|
40
|
+
This will not remove punctuation before simhashing. Yes, we remove all dots, commas, etc. after splitting string to words by default.
|
41
|
+
Because different punctuation does not mean a difference in general. If you do not agree, you can turn this default off.
|
42
|
+
|
43
|
+
==Installation
|
44
|
+
|
45
|
+
As usual:
|
46
|
+
|
47
|
+
gem install simhash
|
48
|
+
|
49
|
+
But if you have {GNU MP library}[http://gmplib.org/], simhash will work faster! To check out which version is used, type:
|
50
|
+
|
51
|
+
Simhash::DEFAULT_STRING_HASH_METHOD
|
52
|
+
|
53
|
+
It should return a symbol. If the symbol ends with "rb", your simhash is slow. If you want to make it faster, install GNU MP.
|
data/Rakefile
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'rake'
|
2
2
|
require 'rake/testtask'
|
3
3
|
require 'rake/rdoctask'
|
4
|
+
require 'rake/extensiontask'
|
4
5
|
|
5
6
|
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
|
6
7
|
require 'simhash'
|
@@ -8,6 +9,8 @@ require 'simhash'
|
|
8
9
|
desc 'Default: run unit tests.'
|
9
10
|
task :default => [:test]
|
10
11
|
|
12
|
+
Rake::ExtensionTask.new('string_hashing')
|
13
|
+
|
11
14
|
desc 'Test the simhash gem'
|
12
15
|
Rake::TestTask.new(:test) do |t|
|
13
16
|
t.libs << 'lib'
|
@@ -18,7 +21,7 @@ end
|
|
18
21
|
desc 'Start an IRB session with all necessary files required.'
|
19
22
|
task :shell do |t|
|
20
23
|
chdir File.dirname(__FILE__)
|
21
|
-
exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems
|
24
|
+
exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
|
22
25
|
end
|
23
26
|
|
24
27
|
desc 'Build the gemspec.'
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Build configuration for the optional GMP-backed String#hash_vl extension.
#
# When GMP (header and library) is available a real Makefile is generated;
# otherwise a do-nothing Makefile is written so that the build step succeeds
# without compiling anything.
extension_name = 'string_hashing'

# Sort out the universal vs. single-architecture build problems on Mac OS X.
if RUBY_PLATFORM.include?('darwin')
  puts "MacOS X build: fixing architecture flags:"

  commonflags = ENV['ARCHFLAGS']
  if commonflags
    puts "  using the value in ARCHFLAGS environment variable (%p)." % [commonflags]

    # Replace whatever -arch flags are already configured with the
    # ones requested through ARCHFLAGS.
    arch_flag = /-arch\s+\S+ /
    $CFLAGS.gsub!(arch_flag, '')
    $LDFLAGS.gsub!(arch_flag, '')
    CONFIG['LDSHARED'].gsub!(arch_flag, '')

    $CFLAGS << ' ' << commonflags
    $LDFLAGS << ' ' << commonflags
    CONFIG['LDSHARED'] << ' ' << commonflags
  else
    $stderr.puts <<-WARNING

  =========== WARNING ===========

  You are building this extension on OS X without setting the
  ARCHFLAGS environment variable.

  If you are seeing this message, that means that the
  build will probably fail.

  ===================================
    WARNING
  end
end

# Allow a non-standard GMP location (--with-gmp-dir / -include / -lib).
dir_config('gmp')

# Require both the header and a linkable library. have_library appends -lgmp
# to the library list (after the object files) instead of forcing it into
# $LDFLAGS, where it precedes the objects and can be dropped by linkers that
# resolve libraries in order.
if find_header('gmp.h') && have_library('gmp')
  $stderr.puts "Configuring extensions"
  dir_config(extension_name)
  create_makefile(extension_name)
else
  $stderr.puts "Skipping building of C extension"
  # Write a placeholder Makefile whose targets do nothing, so that
  # `make`, `make install` and `make clean` all succeed without building.
  File.open(File.join(File.dirname(__FILE__), "Makefile"), "w") do |f|
    f.write("all:\ninstall:\nclean:\n")
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <gmp.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
|
5
|
+
VALUE StringHashing = Qnil;
|
6
|
+
|
7
|
+
void Init_string_hashing();
|
8
|
+
|
9
|
+
VALUE method_hash_vl(VALUE self, VALUE bitlength);
|
10
|
+
|
11
|
+
void Init_string_hashing() {
|
12
|
+
rb_define_method(rb_cString, "hash_vl", method_hash_vl, 1);
|
13
|
+
}
|
14
|
+
|
15
|
+
VALUE method_hash_vl(VALUE self, VALUE bitlength) {
|
16
|
+
int bl = NUM2INT(bitlength);
|
17
|
+
|
18
|
+
// for hard typecasting
|
19
|
+
unsigned char one_char;
|
20
|
+
char* result;
|
21
|
+
result = malloc(bl*sizeof(char));
|
22
|
+
unsigned long long len = RSTRING(self)->len;
|
23
|
+
char *string = RSTRING(self)->ptr;
|
24
|
+
|
25
|
+
if(len == 0){ return 0; }
|
26
|
+
|
27
|
+
mpz_t x, mask, long_len;
|
28
|
+
mpz_init_set_ui (long_len, len);
|
29
|
+
one_char = RSTRING(self)->ptr[0];
|
30
|
+
mpz_init_set_ui (x, one_char << 7);
|
31
|
+
int m = 1000003;
|
32
|
+
|
33
|
+
// generating mask of length bitlength filled with 1
|
34
|
+
mpz_init (mask);
|
35
|
+
mpz_ui_pow_ui(mask, 2, bl);
|
36
|
+
mpz_sub_ui (mask, mask, 1);
|
37
|
+
|
38
|
+
mpz_t computations, byte;
|
39
|
+
mpz_init(computations);
|
40
|
+
mpz_init2 (byte, 8);
|
41
|
+
|
42
|
+
int i = 0;
|
43
|
+
for(i; i < len; i++) {
|
44
|
+
one_char = string[i];
|
45
|
+
mpz_set_ui(byte, one_char);
|
46
|
+
mpz_mul_ui(computations, x, m);
|
47
|
+
mpz_xor(computations, computations, byte);
|
48
|
+
mpz_and (x, mask, computations);
|
49
|
+
}
|
50
|
+
|
51
|
+
mpz_xor(x, x, long_len);
|
52
|
+
mpz_get_str (result, 10, x);
|
53
|
+
VALUE res = rb_str_new2(result);
|
54
|
+
|
55
|
+
mpz_clear(x);
|
56
|
+
mpz_clear(byte);
|
57
|
+
mpz_clear(computations);
|
58
|
+
mpz_clear(mask);
|
59
|
+
mpz_clear(long_len);
|
60
|
+
free(result);
|
61
|
+
|
62
|
+
return res;
|
63
|
+
}
|
data/lib/simhash.rb
CHANGED
@@ -1,11 +1,21 @@
|
|
1
1
|
$KCODE = 'u'
|
2
|
-
require '
|
3
|
-
require
|
2
|
+
require 'unicode'
|
3
|
+
require 'string'
|
4
|
+
require 'integer'
|
5
|
+
require 'simhash/stopwords'
|
6
|
+
begin
|
7
|
+
require 'string_hashing'
|
8
|
+
rescue LoadError
|
9
|
+
end
|
10
|
+
|
4
11
|
|
5
12
|
module Simhash
|
13
|
+
# Name of the String method used to hash tokens by default: :hash_vl when
# String responds to it (the C extension was loaded), otherwise :hash_vl_rb.
# method_defined? is used because comparing the instance-method list against
# the String "hash_vl" never matches on Rubies that list methods as Symbols.
DEFAULT_STRING_HASH_METHOD = String.method_defined?(:hash_vl) ? :hash_vl : :hash_vl_rb
|
14
|
+
|
6
15
|
def self.hash(tokens, options={})
|
7
16
|
hashbits = options[:hashbits] || 64
|
8
17
|
token_min_size = options[:token_min_size].to_i
|
18
|
+
hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
|
9
19
|
|
10
20
|
v = [0] * hashbits
|
11
21
|
masks = v.dup
|
@@ -15,15 +25,13 @@ module Simhash
|
|
15
25
|
# cutting punctuation (\302\240 is unbreakable space)
|
16
26
|
token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
|
17
27
|
|
18
|
-
token = token.strip
|
28
|
+
token = Unicode::downcase(token.strip)
|
19
29
|
|
20
30
|
# cutting stop-words
|
21
31
|
token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
|
22
32
|
|
23
33
|
next if token.size.zero? || token.size < token_min_size
|
24
|
-
|
25
|
-
hashed_token = token.hash_wl(hashbits)
|
26
|
-
bitmask = 0
|
34
|
+
hashed_token = token.send(hashing_method, hashbits).to_i
|
27
35
|
hashbits.times do |i|
|
28
36
|
v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
|
29
37
|
end
|
@@ -35,4 +43,8 @@ module Simhash
|
|
35
43
|
|
36
44
|
fingerprint
|
37
45
|
end
|
46
|
+
|
47
|
+
# Returns the name of the String hashing method in use.
#
# Prefers the @@string_hash_method class variable when it has been set;
# otherwise falls back to DEFAULT_STRING_HASH_METHOD instead of raising
# NameError on the uninitialized class variable.
def self.hm
  return @@string_hash_method if class_variable_defined?(:@@string_hash_method)

  DEFAULT_STRING_HASH_METHOD
end
|
38
50
|
end
|
data/lib/string.rb
CHANGED
@@ -3,19 +3,18 @@ class String
|
|
3
3
|
split_by = options.delete(:split_by) || " "
|
4
4
|
Simhash.hash(self.split(split_by), options)
|
5
5
|
end
|
6
|
-
|
7
6
|
|
8
|
-
|
9
|
-
def hash_wl(length)
|
7
|
+
# Pure-Ruby variable-length hash of the receiver's bytes.
#
# Starting from the first byte shifted left by 7, every byte is folded in as
#   x = ((x * 1000003) ^ byte) & (2**length - 1)
# and the result is finally XORed with the byte length.
#
# Works on bytes (getbyte / bytesize) rather than characters, so the result
# does not depend on the string's encoding.
#
# length - number of bits kept by the mask after each step.
#
# Returns an Integer (0 for an empty string).
def hash_vl_rb(length)
  return 0 if empty?

  multiplier = 1_000_003
  mask = (1 << length) - 1

  x = getbyte(0) << 7
  each_byte { |byte| x = ((x * multiplier) ^ byte) & mask }

  x ^= bytesize
  x = -2 if x == -1
  x
end
|
19
|
+
|
21
20
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhash
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Alex Gusev
|
@@ -15,30 +15,45 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-20 00:00:00 +04:00
|
19
19
|
default_executable:
|
20
|
-
dependencies:
|
21
|
-
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: unicode
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 17
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 3
|
33
|
+
- 1
|
34
|
+
version: 0.3.1
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
22
37
|
description: Implementation of Charikar simhashes in Ruby
|
23
38
|
email: alex.gusev@bookmate.ru
|
24
39
|
executables: []
|
25
40
|
|
26
|
-
extensions:
|
27
|
-
|
41
|
+
extensions:
|
42
|
+
- ext/string_hashing/extconf.rb
|
28
43
|
extra_rdoc_files: []
|
29
44
|
|
30
45
|
files:
|
31
|
-
- README
|
46
|
+
- README.rdoc
|
32
47
|
- LICENSE
|
33
48
|
- Rakefile
|
34
|
-
- init.rb
|
35
49
|
- lib/integer.rb
|
36
50
|
- lib/simhash/stopwords/en.rb
|
37
51
|
- lib/simhash/stopwords/ru.rb
|
38
52
|
- lib/simhash/stopwords.rb
|
39
53
|
- lib/simhash.rb
|
40
54
|
- lib/string.rb
|
41
|
-
-
|
55
|
+
- ext/string_hashing/extconf.rb
|
56
|
+
- ext/string_hashing/string_hashing.c
|
42
57
|
has_rdoc: true
|
43
58
|
homepage: http://github.com/bookmate/simhash
|
44
59
|
licenses: []
|
data/README
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Implementation of Charikar simhashes in Ruby
|
data/init.rb
DELETED
data/rails/init.rb
DELETED