simhash 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,53 @@
1
+ ==Abstract
2
+
3
+ This is an implementation of {Moses Charikar's simhashes}[http://portal.acm.org/citation.cfm?id=509965] in Ruby.
4
+
5
+ ==Usage
6
+
7
+ When you have a string and want to calculate its simhash, call:
8
+
9
+ my_string.simhash
10
+
11
+ By default it will generate a 64-bit integer - that is the simhash for this string.
12
+
13
+ It's always better to tokenize string before simhashing. It's as simple as
14
+
15
+ my_string.simhash(:split_by => / /)
16
+
17
+ This will also generate a 64-bit integer, but will split the string into words first.
18
+ It's handy when you need to calculate similarity of strings based on word usage.
19
+ You can split string as you like: by letters/sentences/specific letter-combinations, etc.
20
+
21
+ my_string.simhash(:split_by => /./, :bitlength => 512)
22
+
23
+ Sometimes you might need longer simhash (finding similarity for very long strings is a good example).
24
+ You can set length of result hash by passing bitlength parameter. This example will return 512-bit simhash
25
+ for your string split by sentences.
26
+
27
+ ==Advanced usage
28
+
29
+ It's useful to clean your string before simhashing. But it's useful not to clean, too.
30
+
31
+ Here are examples:
32
+
33
+ my_string.simhash(:stop_words => true) # here we clean
34
+
35
+ This will find stop-words in your string and remove them before simhashing. Stop-words are "the", "not", "about", etc.
36
+ Currently we remove only Russian and English stop-words.
37
+
38
+ my_string.simhash(:preserve_punctuation => true) # here we do not
39
+
40
+ This will not remove punctuation before simhashing. Yes, by default we remove all dots, commas, etc. after splitting the string into words.
41
+ That is because different punctuation does not usually mean a difference in content. If you do not agree, you can turn this default off.
42
+
43
+ ==Installation
44
+
45
+ As usual:
46
+
47
+ gem install simhash
48
+
49
+ But if you have the {GNU MP library}[http://gmplib.org/], simhash will work faster! To check which version is used, type:
50
+
51
+ Simhash::DEFAULT_STRING_HASH_METHOD
52
+
53
+ It should return a symbol. If the symbol ends with "rb", your simhash is slow. If you want to make it faster, install GNU MP.
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'rake'
2
2
  require 'rake/testtask'
3
3
  require 'rake/rdoctask'
4
+ require 'rake/extensiontask'
4
5
 
5
6
  $LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
6
7
  require 'simhash'
@@ -8,6 +9,8 @@ require 'simhash'
8
9
  desc 'Default: run unit tests.'
9
10
  task :default => [:test]
10
11
 
12
+ Rake::ExtensionTask.new('string_hashing')
13
+
11
14
  desc 'Test the simhash gem'
12
15
  Rake::TestTask.new(:test) do |t|
13
16
  t.libs << 'lib'
@@ -18,7 +21,7 @@ end
18
21
  desc 'Start an IRB session with all necessary files required.'
19
22
  task :shell do |t|
20
23
  chdir File.dirname(__FILE__)
21
- exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems -r init'
24
+ exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
22
25
  end
23
26
 
24
27
  desc 'Build the gemspec.'
@@ -0,0 +1,48 @@
1
+ require 'mkmf'
2
+
3
+ extension_name = 'string_hashing'
4
+ # should link against the libgmp library
5
+ $LDFLAGS << ' -lgmp'
6
+
7
+ # Sort out the universal vs. single-architecture build problems on MacOS X
8
+ if RUBY_PLATFORM.include?( 'darwin' )
9
+ puts "MacOS X build: fixing architecture flags:"
10
+
11
+ commonflags = nil
12
+ if ENV['ARCHFLAGS']
13
+ puts " using the value in ARCHFLAGS environment variable (%p)." % [ ENV['ARCHFLAGS'] ]
14
+ commonflags = ENV['ARCHFLAGS']
15
+ else
16
+ $stderr.puts %{
17
+ =========== WARNING ===========
18
+
19
+ You are building this extension on OS X without setting the
20
+ ARCHFLAGS environment variable.
21
+
22
+ If you are seeing this message, that means that the
23
+ build will probably fail.
24
+
25
+ ===================================
26
+ }.gsub( /^\t+/, ' ' )
27
+ end
28
+
29
+ if commonflags
30
+ $CFLAGS.gsub!( /-arch\s+\S+ /, '' )
31
+ $LDFLAGS.gsub!( /-arch\s+\S+ /, '' )
32
+ CONFIG['LDSHARED'].gsub!( /-arch\s+\S+ /, '' )
33
+
34
+ $CFLAGS << ' ' << commonflags
35
+ $LDFLAGS << ' ' << commonflags
36
+ CONFIG['LDSHARED'] << ' ' << commonflags
37
+ end
38
+ end
39
+
40
+ if find_header("gmp.h")
41
+ $stderr.puts "Configuring extensions"
42
+ dir_config(extension_name)
43
+ create_makefile(extension_name)
44
+ else
45
+ $stderr.puts "Skipping building of C extension"
46
+ # creating a dummy Makefile with empty targets to avoid building the extension
47
+ File.open(File.join(File.dirname(__FILE__), "Makefile"), "w"){|f| f.write("all: \ninstall: \n")}
48
+ end
@@ -0,0 +1,63 @@
1
+ #include "ruby.h"
2
+ #include <gmp.h>
3
+ #include <stdio.h>
4
+
5
+ VALUE StringHashing = Qnil;
6
+
7
+ void Init_string_hashing();
8
+
9
+ VALUE method_hash_vl(VALUE self, VALUE bitlength);
10
+
11
+ void Init_string_hashing() {
12
+ rb_define_method(rb_cString, "hash_vl", method_hash_vl, 1);
13
+ }
14
+
15
+ VALUE method_hash_vl(VALUE self, VALUE bitlength) {
16
+ int bl = NUM2INT(bitlength);
17
+
18
+ // for hard typecasting
19
+ unsigned char one_char;
20
+ char* result;
21
+ result = malloc(bl*sizeof(char));
22
+ unsigned long long len = RSTRING(self)->len;
23
+ char *string = RSTRING(self)->ptr;
24
+
25
+ if(len == 0){ return 0; }
26
+
27
+ mpz_t x, mask, long_len;
28
+ mpz_init_set_ui (long_len, len);
29
+ one_char = RSTRING(self)->ptr[0];
30
+ mpz_init_set_ui (x, one_char << 7);
31
+ int m = 1000003;
32
+
33
+ // generating mask of length bitlength filled with 1
34
+ mpz_init (mask);
35
+ mpz_ui_pow_ui(mask, 2, bl);
36
+ mpz_sub_ui (mask, mask, 1);
37
+
38
+ mpz_t computations, byte;
39
+ mpz_init(computations);
40
+ mpz_init2 (byte, 8);
41
+
42
+ int i = 0;
43
+ for(i; i < len; i++) {
44
+ one_char = string[i];
45
+ mpz_set_ui(byte, one_char);
46
+ mpz_mul_ui(computations, x, m);
47
+ mpz_xor(computations, computations, byte);
48
+ mpz_and (x, mask, computations);
49
+ }
50
+
51
+ mpz_xor(x, x, long_len);
52
+ mpz_get_str (result, 10, x);
53
+ VALUE res = rb_str_new2(result);
54
+
55
+ mpz_clear(x);
56
+ mpz_clear(byte);
57
+ mpz_clear(computations);
58
+ mpz_clear(mask);
59
+ mpz_clear(long_len);
60
+ free(result);
61
+
62
+ return res;
63
+ }
data/lib/simhash.rb CHANGED
@@ -1,11 +1,21 @@
1
1
  $KCODE = 'u'
2
- require 'active_support/core_ext/string/multibyte'
3
- require File.join(File.dirname(__FILE__), "simhash", "stopwords")
2
+ require 'unicode'
3
+ require 'string'
4
+ require 'integer'
5
+ require 'simhash/stopwords'
6
+ begin
7
+ require 'string_hashing'
8
+ rescue LoadError
9
+ end
10
+
4
11
 
5
12
  module Simhash
13
+ DEFAULT_STRING_HASH_METHOD = String.public_instance_methods.include?("hash_vl") ? :hash_vl : :hash_vl_rb
14
+
6
15
  def self.hash(tokens, options={})
7
16
  hashbits = options[:hashbits] || 64
8
17
  token_min_size = options[:token_min_size].to_i
18
+ hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
9
19
 
10
20
  v = [0] * hashbits
11
21
  masks = v.dup
@@ -15,15 +25,13 @@ module Simhash
15
25
  # cutting punctuation (\302\240 is unbreakable space)
16
26
  token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
17
27
 
18
- token = token.strip.mb_chars.downcase
28
+ token = Unicode::downcase(token.strip)
19
29
 
20
30
  # cutting stop-words
21
31
  token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
22
32
 
23
33
  next if token.size.zero? || token.size < token_min_size
24
-
25
- hashed_token = token.hash_wl(hashbits)
26
- bitmask = 0
34
+ hashed_token = token.send(hashing_method, hashbits).to_i
27
35
  hashbits.times do |i|
28
36
  v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
29
37
  end
@@ -35,4 +43,8 @@ module Simhash
35
43
 
36
44
  fingerprint
37
45
  end
46
+
47
+ def self.hm
48
+ @@string_hash_method
49
+ end
38
50
  end
data/lib/string.rb CHANGED
@@ -3,19 +3,18 @@ class String
3
3
  split_by = options.delete(:split_by) || " "
4
4
  Simhash.hash(self.split(split_by), options)
5
5
  end
6
-
7
6
 
8
- # string hash of predefined length
9
- def hash_wl(length)
7
+ def hash_vl_rb(length)
10
8
  return 0 if self == ""
11
9
 
12
10
  x = self[0] << 7
13
11
  m = 1000003
14
12
  mask = (1<<length) - 1
15
13
  self.each_byte{ |char| x = ((x * m) ^ char) & mask }
16
-
14
+
17
15
  x ^= self.size
18
16
  x = -2 if x == -1
19
17
  x
20
18
  end
19
+
21
20
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 0.1.0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Gusev
@@ -15,30 +15,45 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-17 00:00:00 +04:00
18
+ date: 2010-08-20 00:00:00 +04:00
19
19
  default_executable:
20
- dependencies: []
21
-
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: unicode
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 17
30
+ segments:
31
+ - 0
32
+ - 3
33
+ - 1
34
+ version: 0.3.1
35
+ type: :runtime
36
+ version_requirements: *id001
22
37
  description: Implementation of Charikar simhashes in Ruby
23
38
  email: alex.gusev@bookmate.ru
24
39
  executables: []
25
40
 
26
- extensions: []
27
-
41
+ extensions:
42
+ - ext/string_hashing/extconf.rb
28
43
  extra_rdoc_files: []
29
44
 
30
45
  files:
31
- - README
46
+ - README.rdoc
32
47
  - LICENSE
33
48
  - Rakefile
34
- - init.rb
35
49
  - lib/integer.rb
36
50
  - lib/simhash/stopwords/en.rb
37
51
  - lib/simhash/stopwords/ru.rb
38
52
  - lib/simhash/stopwords.rb
39
53
  - lib/simhash.rb
40
54
  - lib/string.rb
41
- - rails/init.rb
55
+ - ext/string_hashing/extconf.rb
56
+ - ext/string_hashing/string_hashing.c
42
57
  has_rdoc: true
43
58
  homepage: http://github.com/bookmate/simhash
44
59
  licenses: []
data/README DELETED
@@ -1 +0,0 @@
1
- Implementation of Charikar simhashes in Ruby
data/init.rb DELETED
@@ -1,3 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "lib", "simhash")
2
- require File.join(File.dirname(__FILE__), "lib", "string")
3
- require File.join(File.dirname(__FILE__), "lib", "integer")
data/rails/init.rb DELETED
@@ -1,3 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "..", "lib", "simhash")
2
- require File.join(File.dirname(__FILE__), "..", "lib", "string")
3
- require File.join(File.dirname(__FILE__), "..", "lib", "integer")