simhash 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. data/lib/simhash.rb +9 -6
  2. data/lib/string.rb +3 -1
  3. metadata +4 -4
data/lib/simhash.rb CHANGED
@@ -1,5 +1,3 @@
1
- #require 'rubygems'
2
- #require 'lingua/stemmer'
3
1
  $KCODE = 'u'
4
2
  require 'active_support/core_ext/string/multibyte'
5
3
  require File.join(File.dirname(__FILE__), "simhash", "stopwords")
@@ -7,18 +5,23 @@ require File.join(File.dirname(__FILE__), "simhash", "stopwords")
7
5
  module Simhash
8
6
  def self.hash(tokens, options={})
9
7
  hashbits = options[:hashbits] || 64
8
+ token_min_size = options[:token_min_size].to_i
10
9
 
11
10
  v = [0] * hashbits
12
11
  masks = v.dup
13
12
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
14
13
 
15
14
  tokens.each do |token|
16
- token = token.gsub(/(\s|\d|\W)+/u,' ').strip
15
+ # cutting punctuation (\302\240 is unbreakable space)
16
+ token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
17
17
 
18
- next if token.size < 2
19
- next if options[:stop_words] && Stopwords::ALL.index(" #{token.strip.mb_chars.downcase} ") != nil
18
+ token = token.strip.mb_chars.downcase
20
19
 
21
- #token = Lingua.stemmer(token, :language => :ru)
20
+ # cutting stop-words
21
+ token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
22
+
23
+ next if token.size.zero? || token.size < token_min_size
24
+
22
25
  hashed_token = token.hash_wl(hashbits)
23
26
  bitmask = 0
24
27
  hashbits.times do |i|
data/lib/string.rb CHANGED
@@ -1,7 +1,9 @@
1
1
  class String
2
2
  def simhash(options={})
3
- Simhash.hash(self.split, options)
3
+ split_by = options.delete(:split_by) || " "
4
+ Simhash.hash(self.split(split_by), options)
4
5
  end
6
+
5
7
 
6
8
  # string hash of predefined length
7
9
  def hash_wl(length)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
+ - 1
8
9
  - 0
9
- - 4
10
- version: 0.0.4
10
+ version: 0.1.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Gusev
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-13 00:00:00 +04:00
18
+ date: 2010-08-17 00:00:00 +04:00
19
19
  default_executable:
20
20
  dependencies: []
21
21