simhash 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/simhash.rb +9 -6
  2. data/lib/string.rb +3 -1
  3. metadata +4 -4
data/lib/simhash.rb CHANGED
@@ -1,5 +1,3 @@
1
- #require 'rubygems'
2
- #require 'lingua/stemmer'
3
1
  $KCODE = 'u'
4
2
  require 'active_support/core_ext/string/multibyte'
5
3
  require File.join(File.dirname(__FILE__), "simhash", "stopwords")
@@ -7,18 +5,23 @@ require File.join(File.dirname(__FILE__), "simhash", "stopwords")
7
5
  module Simhash
8
6
  def self.hash(tokens, options={})
9
7
  hashbits = options[:hashbits] || 64
8
+ token_min_size = options[:token_min_size].to_i
10
9
 
11
10
  v = [0] * hashbits
12
11
  masks = v.dup
13
12
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
14
13
 
15
14
  tokens.each do |token|
16
- token = token.gsub(/(\s|\d|\W)+/u,' ').strip
15
+ # cutting punctuation (\302\240 is unbreakable space)
16
+ token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
17
17
 
18
- next if token.size < 2
19
- next if options[:stop_words] && Stopwords::ALL.index(" #{token.strip.mb_chars.downcase} ") != nil
18
+ token = token.strip.mb_chars.downcase
20
19
 
21
- #token = Lingua.stemmer(token, :language => :ru)
20
+ # cutting stop-words
21
+ token = token.split(" ").reject{ |w| Stopwords::ALL.index(" #{w} ") != nil }.join(" ") if options[:stop_words]
22
+
23
+ next if token.size.zero? || token.size < token_min_size
24
+
22
25
  hashed_token = token.hash_wl(hashbits)
23
26
  bitmask = 0
24
27
  hashbits.times do |i|
data/lib/string.rb CHANGED
@@ -1,7 +1,9 @@
1
1
  class String
2
2
  def simhash(options={})
3
- Simhash.hash(self.split, options)
3
+ split_by = options.delete(:split_by) || " "
4
+ Simhash.hash(self.split(split_by), options)
4
5
  end
6
+
5
7
 
6
8
  # string hash of predefined length
7
9
  def hash_wl(length)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
+ - 1
8
9
  - 0
9
- - 4
10
- version: 0.0.4
10
+ version: 0.1.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Gusev
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-13 00:00:00 +04:00
18
+ date: 2010-08-17 00:00:00 +04:00
19
19
  default_executable:
20
20
  dependencies: []
21
21