simhash 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,53 @@
1
+ ==Abstract
2
+
3
+ This is an implementation of {Moses Charikar's simhashes}[http://portal.acm.org/citation.cfm?id=509965] in Ruby.
4
+
5
+ ==Usage
6
+
7
+ When you have a string and want to calculate its simhash, call:
8
+
9
+ my_string.simhash
10
+
11
+ By default it generates a 64-bit integer, which is the simhash of this string.
12
+
13
+ It's always better to tokenize the string before simhashing. It's as simple as
14
+
15
+ my_string.simhash(:split_by => / /)
16
+
17
+ This will also generate a 64-bit integer, but the string will be split into words first.
18
+ It's handy when you need to calculate similarity of strings based on word usage.
19
+ You can split the string as you like: by letters, sentences, specific letter combinations, etc.
20
+
21
+ my_string.simhash(:split_by => /\./, :hashbits => 512)
22
+
23
+ Sometimes you might need a longer simhash (finding similarity for very long strings is a good example).
24
+ You can set the length of the resulting hash by passing the hashbits parameter. This example will return a 512-bit simhash
25
+ for your string split into sentences.
26
+
27
+ ==Advanced usage
28
+
29
+ It's useful to clean your string before simhashing. But it's useful not to clean, too.
30
+
31
+ Here are examples:
32
+
33
+ my_string.simhash(:stop_words => true) # here we clean
34
+
35
+ This will find stop-words in your string and remove them before simhashing. Stop-words are "the", "not", "about", etc.
36
+ Currently we remove only Russian and English stop-words.
37
+
38
+ my_string.simhash(:preserve_punctuation => true) # here we do not
39
+
40
+ This will not remove punctuation before simhashing. Yes, by default we remove all dots, commas, etc. after splitting the string into words,
41
+ because different punctuation generally does not mean different content. If you disagree, you can turn this default off.
42
+
43
+ ==Installation
44
+
45
+ As usual:
46
+
47
+ gem install simhash
48
+
49
+ But if you have the {GNU MP library}[http://gmplib.org/], simhash will work faster! To check which version is used, type:
50
+
51
+ Simhash::DEFAULT_STRING_HASH_METHOD
52
+
53
+ It should return a symbol. If the symbol ends with "rb", the slower pure-Ruby implementation is in use. If you want to make it faster, install GNU MP.
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'rake'
2
2
  require 'rake/testtask'
3
3
  require 'rake/rdoctask'
4
+ require 'rake/extensiontask'
4
5
 
5
6
  $LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
6
7
  require 'simhash'
@@ -8,6 +9,8 @@ require 'simhash'
8
9
  desc 'Default: run unit tests.'
9
10
  task :default => [:test]
10
11
 
12
+ Rake::ExtensionTask.new('string_hashing')
13
+
11
14
  desc 'Test the simhash gem'
12
15
  Rake::TestTask.new(:test) do |t|
13
16
  t.libs << 'lib'
@@ -18,7 +21,7 @@ end
18
21
  desc 'Start an IRB session with all necessary files required.'
19
22
  task :shell do |t|
20
23
  chdir File.dirname(__FILE__)
21
- exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems -r init'
24
+ exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
22
25
  end
23
26
 
24
27
  desc 'Build the gemspec.'
@@ -0,0 +1,48 @@
1
# Build configuration for the optional GMP-backed `string_hashing` C extension.
#
# When gmp.h and libgmp are both usable, a real Makefile is generated.
# Otherwise a do-nothing Makefile is written so that installation still
# succeeds without building the extension.
require 'mkmf'

extension_name = 'string_hashing'

# Sort out the universal vs. single-architecture build problems on MacOS X
if RUBY_PLATFORM.include?('darwin')
  puts "MacOS X build: fixing architecture flags:"

  commonflags = ENV['ARCHFLAGS']
  if commonflags
    puts " using the value in ARCHFLAGS environment variable (%p)." % [commonflags]

    # Drop the -arch flags inherited from the Ruby build and use ARCHFLAGS instead.
    arch_flag = /-arch\s+\S+ /
    [$CFLAGS, $LDFLAGS, CONFIG['LDSHARED']].each do |flags|
      flags.gsub!(arch_flag, '')
      flags << ' ' << commonflags
    end
  else
    $stderr.puts %{
=========== WARNING ===========

You are building this extension on OS X without setting the
ARCHFLAGS environment variable.

If you are seeing this message, that means that the
build will probably fail.

===================================
}.gsub( /^\t+/, ' ' )
  end
end

# libgmp is linked through have_library rather than by appending -lgmp to
# $LDFLAGS: have_library records the library in $libs, which is placed after
# the object files on the link line, and it also verifies that the library can
# actually be linked, so a header-only installation falls back cleanly.
if find_header('gmp.h') && have_library('gmp')
  $stderr.puts "Configuring extensions"
  dir_config(extension_name)
  create_makefile(extension_name)
else
  $stderr.puts "Skipping building of C extension"
  # Write a do-nothing Makefile so the build/install steps succeed without compiling anything.
  File.open(File.join(File.dirname(__FILE__), "Makefile"), "w") { |f| f.write("all: \ninstall: \n") }
end
@@ -0,0 +1,63 @@
1
+ #include "ruby.h"
2
+ #include <gmp.h>
3
+ #include <stdio.h>
4
+
5
+ VALUE StringHashing = Qnil;
6
+
7
+ void Init_string_hashing();
8
+
9
+ VALUE method_hash_vl(VALUE self, VALUE bitlength);
10
+
11
+ void Init_string_hashing() {
12
+ rb_define_method(rb_cString, "hash_vl", method_hash_vl, 1);
13
+ }
14
+
15
+ VALUE method_hash_vl(VALUE self, VALUE bitlength) {
16
+ int bl = NUM2INT(bitlength);
17
+
18
+ // for hard typecasting
19
+ unsigned char one_char;
20
+ char* result;
21
+ result = malloc(bl*sizeof(char));
22
+ unsigned long long len = RSTRING(self)->len;
23
+ char *string = RSTRING(self)->ptr;
24
+
25
+ if(len == 0){ return 0; }
26
+
27
+ mpz_t x, mask, long_len;
28
+ mpz_init_set_ui (long_len, len);
29
+ one_char = RSTRING(self)->ptr[0];
30
+ mpz_init_set_ui (x, one_char << 7);
31
+ int m = 1000003;
32
+
33
+ // generating mask of length bitlength filled with 1
34
+ mpz_init (mask);
35
+ mpz_ui_pow_ui(mask, 2, bl);
36
+ mpz_sub_ui (mask, mask, 1);
37
+
38
+ mpz_t computations, byte;
39
+ mpz_init(computations);
40
+ mpz_init2 (byte, 8);
41
+
42
+ int i = 0;
43
+ for(i; i < len; i++) {
44
+ one_char = string[i];
45
+ mpz_set_ui(byte, one_char);
46
+ mpz_mul_ui(computations, x, m);
47
+ mpz_xor(computations, computations, byte);
48
+ mpz_and (x, mask, computations);
49
+ }
50
+
51
+ mpz_xor(x, x, long_len);
52
+ mpz_get_str (result, 10, x);
53
+ VALUE res = rb_str_new2(result);
54
+
55
+ mpz_clear(x);
56
+ mpz_clear(byte);
57
+ mpz_clear(computations);
58
+ mpz_clear(mask);
59
+ mpz_clear(long_len);
60
+ free(result);
61
+
62
+ return res;
63
+ }
data/lib/simhash.rb CHANGED
@@ -1,11 +1,21 @@
1
1
  $KCODE = 'u'
2
- require 'active_support/core_ext/string/multibyte'
3
- require File.join(File.dirname(__FILE__), "simhash", "stopwords")
2
+ require 'unicode'
3
+ require 'string'
4
+ require 'integer'
5
+ require 'simhash/stopwords'
6
+ begin
7
+ require 'string_hashing'
8
+ rescue LoadError
9
+ end
10
+
4
11
 
5
12
  module Simhash
13
# Name of the String method used to hash tokens when no :hashing_method option
# is given: :hash_vl if String responds to it, otherwise :hash_vl_rb.
# method_defined? is used instead of public_instance_methods.include?("hash_vl")
# because that list holds Symbols on Ruby 1.9+, so a String lookup never matches.
DEFAULT_STRING_HASH_METHOD = String.method_defined?(:hash_vl) ? :hash_vl : :hash_vl_rb
14
+
6
15
  def self.hash(tokens, options={})
7
16
  hashbits = options[:hashbits] || 64
8
17
  token_min_size = options[:token_min_size].to_i
18
+ hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
9
19
 
10
20
  v = [0] * hashbits
11
21
  masks = v.dup
@@ -15,15 +25,13 @@ module Simhash
15
25
  # cutting punctuation (\302\240 is unbreakable space)
16
26
  token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
17
27
 
18
- token = token.strip.mb_chars.downcase
28
+ token = Unicode::downcase(token.strip)
19
29
 
20
30
  # cutting stop-words
21
31
  token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
22
32
 
23
33
  next if token.size.zero? || token.size < token_min_size
24
-
25
- hashed_token = token.hash_wl(hashbits)
26
- bitmask = 0
34
+ hashed_token = token.send(hashing_method, hashbits).to_i
27
35
  hashbits.times do |i|
28
36
  v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
29
37
  end
@@ -35,4 +43,8 @@ module Simhash
35
43
 
36
44
  fingerprint
37
45
  end
46
+
47
# Returns the name of the default String hashing method
# (DEFAULT_STRING_HASH_METHOD, i.e. :hash_vl or :hash_vl_rb).
# The @@string_hash_method class variable is never assigned, so reading it
# would raise NameError; the constant is returned instead.
def self.hm
  DEFAULT_STRING_HASH_METHOD
end
38
50
  end
data/lib/string.rb CHANGED
@@ -3,19 +3,18 @@ class String
3
3
  split_by = options.delete(:split_by) || " "
4
4
  Simhash.hash(self.split(split_by), options)
5
5
  end
6
-
7
6
 
8
- # string hash of predefined length
9
- def hash_wl(length)
7
# Pure-Ruby string hash truncated to +length+ bits.
#
# Starts from the first byte shifted left by 7, then for every byte computes
# x = ((x * 1000003) ^ byte) & mask, and finally XORs in the byte length.
# Works on bytes (getbyte/each_byte/bytesize) so the result does not depend
# on the string's encoding or on how the running Ruby indexes strings.
#
# length - number of bits to keep after each step.
#
# Returns a non-negative Integer; 0 for an empty string.
def hash_vl_rb(length)
  return 0 if empty?

  m = 1000003
  mask = (1 << length) - 1
  x = getbyte(0) << 7
  each_byte { |byte| x = ((x * m) ^ byte) & mask }

  # x is never negative here, so no -1 sentinel remapping is needed.
  x ^ bytesize
end
19
+
21
20
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 0.1.0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Gusev
@@ -15,30 +15,45 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-17 00:00:00 +04:00
18
+ date: 2010-08-20 00:00:00 +04:00
19
19
  default_executable:
20
- dependencies: []
21
-
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: unicode
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 17
30
+ segments:
31
+ - 0
32
+ - 3
33
+ - 1
34
+ version: 0.3.1
35
+ type: :runtime
36
+ version_requirements: *id001
22
37
  description: Implementation of Charikar simhashes in Ruby
23
38
  email: alex.gusev@bookmate.ru
24
39
  executables: []
25
40
 
26
- extensions: []
27
-
41
+ extensions:
42
+ - ext/string_hashing/extconf.rb
28
43
  extra_rdoc_files: []
29
44
 
30
45
  files:
31
- - README
46
+ - README.rdoc
32
47
  - LICENSE
33
48
  - Rakefile
34
- - init.rb
35
49
  - lib/integer.rb
36
50
  - lib/simhash/stopwords/en.rb
37
51
  - lib/simhash/stopwords/ru.rb
38
52
  - lib/simhash/stopwords.rb
39
53
  - lib/simhash.rb
40
54
  - lib/string.rb
41
- - rails/init.rb
55
+ - ext/string_hashing/extconf.rb
56
+ - ext/string_hashing/string_hashing.c
42
57
  has_rdoc: true
43
58
  homepage: http://github.com/bookmate/simhash
44
59
  licenses: []
data/README DELETED
@@ -1 +0,0 @@
1
- Implementation of Charikar simhashes in Ruby
data/init.rb DELETED
@@ -1,3 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "lib", "simhash")
2
- require File.join(File.dirname(__FILE__), "lib", "string")
3
- require File.join(File.dirname(__FILE__), "lib", "integer")
data/rails/init.rb DELETED
@@ -1,3 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "..", "lib", "simhash")
2
- require File.join(File.dirname(__FILE__), "..", "lib", "string")
3
- require File.join(File.dirname(__FILE__), "..", "lib", "integer")