simhash2 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bd2ed19bec8ea85cee9f79596b1462883eca1df5
4
+ data.tar.gz: 642f9ce751132870bb0dfe14301fbe3199bd0bcd
5
+ SHA512:
6
+ metadata.gz: 34d638e74e3dade90dfa25e9d9d56e3b2574b155cd90123c5d359e73c83afa0270613c1325b5b9cf198e5c79b0063b319ce9c6d55df6715e954834d1abd8afb0
7
+ data.tar.gz: eb0302bc772a0d53ba5ce6099bc3cb40376e0dfc9c987ab95194888e6483afb66f146b518fce240e1c4b69ab05cfeff70893eef37c52fa474dc98e6c5b62868d
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.gitignore ADDED
@@ -0,0 +1,50 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format doc
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,6 @@
1
+ AllCops:
2
+ Exclude:
3
+ - 'spec/spec_helper.rb'
4
+
5
+ Metrics/LineLength:
6
+ Enabled: false
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
# Fetch gems over HTTPS rather than plain HTTP so downloads cannot be
# tampered with in transit.
source 'https://rubygems.org'

# Specify your gem's dependencies in simhash2.gemspec
gemspec

# Tooling used by the spec suite and the Rakefile. coveralls is loaded
# explicitly by spec/spec_helper.rb, hence `require: false`.
gem 'coveralls', require: false
gem 'rake'
gem 'rspec'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Jonathan Wong
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,6 @@
1
+ [![Gem Version](https://badge.fury.io/rb/simhash2.svg)](https://badge.fury.io/rb/simhash2)
2
+ [![Code Climate](https://codeclimate.com/github/armchairtheorist/simhash/badges/gpa.svg)](https://codeclimate.com/github/armchairtheorist/simhash)
3
+ [![Build Status](https://travis-ci.org/armchairtheorist/simhash.svg?branch=master)](https://travis-ci.org/armchairtheorist/simhash)
4
+ [![Coverage Status](https://coveralls.io/repos/github/armchairtheorist/simhash/badge.svg?branch=master)](https://coveralls.io/github/armchairtheorist/simhash?branch=master)
5
+
6
+ # Simhash
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require 'rspec/core/rake_task'
2
+
3
# Define the `spec` rake task via RSpec's bundled task class.
RSpec::Core::RakeTask.new(:spec)

# Make `spec` the task that a bare `rake` invocation runs.
task default: :spec
@@ -0,0 +1,3 @@
1
# Namespace of the simhash2 gem.
module Simhash
  # Release version of the gem; simhash2.gemspec reads it as Simhash::VERSION.
  VERSION = '0.0.1'.freeze
end
data/lib/simhash.rb ADDED
@@ -0,0 +1,68 @@
1
+ require 'simhash/version'
2
+
3
# Simhash fingerprints: HASHBITS-wide integers built by letting every token
# of a text vote on each bit, so that texts sharing most of their tokens end
# up with fingerprints that differ in few bits.
#
# All public methods are callable directly on the module, e.g.
# Simhash.generate('some text').
module Simhash
  extend self

  # Width, in bits, of the per-token hashes and of the resulting fingerprint.
  HASHBITS = 64

  # Default options; caller-supplied options are merged over these.
  #
  # min_token_length - tokens shorter than this (after normalization) are dropped
  # unique           - when true, duplicate tokens are counted once
  # stemming         - when true, each token is replaced by token.stem; String#stem
  #                    is not defined in this file, so the caller must load a
  #                    library that provides it (the spec uses 'fast-stemmer')
  # stop_words       - tokens to discard; compared against the normalized
  #                    (downcased, punctuation-free) token
  OPTIONS = {
    min_token_length: 1,
    unique: false,
    stemming: false,
    stop_words: []
  }.freeze

  # Fingerprints a string by splitting it on runs of whitespace.
  #
  # @param str [String] text to fingerprint
  # @param options [Hash] overrides for OPTIONS
  # @return [Integer] fingerprint in 0...(1 << HASHBITS)
  def generate(str, options = {})
    generate_from_tokens(str.split(/\s+/), options)
  end

  # Fingerprints an already tokenized text.
  #
  # The tokens are normalized and filtered (see #filter_tokens) on a copy, so
  # the caller's array is left untouched and may be frozen.
  #
  # @param tokens [Array<String>] raw tokens
  # @param options [Hash] overrides for OPTIONS
  # @return [Integer] fingerprint in 0...(1 << HASHBITS)
  def generate_from_tokens(tokens, options = {})
    filtered = filter_tokens(tokens, OPTIONS.merge(options))

    # One counter per bit position: +1 for every token hash with that bit set,
    # -1 for every token hash with it clear.
    votes = Array.new(HASHBITS, 0)
    filtered.each do |token|
      digest = simple_string_hash(token, HASHBITS)
      HASHBITS.times { |bit| votes[bit] += digest[bit].zero? ? -1 : 1 }
    end

    # A bit is set when its counter is non-negative. Ties count as set, so an
    # input with no surviving tokens yields a fingerprint of all ones.
    (0...HASHBITS).reduce(0) do |fingerprint, bit|
      votes[bit] >= 0 ? fingerprint | (1 << bit) : fingerprint
    end
  end

  # Number of bit positions in which two fingerprints differ.
  #
  # Both arguments are converted with to_i before being compared.
  #
  # @return [Integer]
  def hamming_distance(simhash1, simhash2)
    (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
  end

  private

  # Multiplicative string hash over the bytes of str, truncated to `length`
  # bits after every step; the byte count is XORed in at the end.
  #
  # @param str [String]
  # @param length [Integer] hash width in bits
  # @return [Integer] 0 for the empty string, otherwise a non-negative integer
  def simple_string_hash(str, length)
    return 0 if str.empty?

    bytes = str.bytes
    mask = (1 << length) - 1
    seed = bytes.first << 7
    mixed = bytes.reduce(seed) { |acc, byte| ((acc * 1_000_003) ^ byte) & mask }

    mixed ^ bytes.length
  end

  # Returns a new array of normalized tokens: downcased, stripped of non-word
  # characters, then filtered by minimum length and stop words, optionally
  # stemmed and de-duplicated. The input array is not modified.
  #
  # @param tokens [Array<String>]
  # @param options [Hash] fully merged options (all OPTIONS keys present)
  # @return [Array<String>]
  def filter_tokens(tokens, options)
    stop_words = options[:stop_words]

    kept = tokens.map { |token| token.downcase.gsub(/\W+/, '') }
    kept.reject! { |token| token.length < options[:min_token_length] }
    kept.reject! { |token| stop_words.include?(token) } unless stop_words.nil? || stop_words.empty?
    kept.map!(&:stem) if options[:stemming]
    kept.uniq! if options[:unique]
    kept
  end
end
data/simhash2.gemspec ADDED
@@ -0,0 +1,18 @@
1
# Gem specification for simhash2. The version is read from
# lib/simhash/version.rb so it is defined in exactly one place.
require File.expand_path('../lib/simhash/version', __FILE__)

Gem::Specification.new do |spec|
  spec.name = 'simhash2'
  spec.version = Simhash::VERSION
  spec.authors = ['Jonathan Wong']
  spec.email = ['jonathan@armchairtheorist.com']
  spec.summary = "A rewrite of the 'simhash' gem, which is an implementation of Moses Charikar's simhashes in Ruby."
  spec.homepage = 'https://github.com/armchairtheorist/simhash'
  spec.license = 'MIT'

  spec.files = `git ls-files`.split("\n")
  # List everything tracked under spec/. The previous pathspec `{spec}/*`
  # matched nothing, which left test_files empty in the built gem.
  spec.test_files = `git ls-files -- spec`.split("\n")
  spec.require_paths = ['lib']

  # '~> 0' only admits versions below 1.0; spec/spec_helper.rb uses the
  # RSpec 3 configuration API, so pin to the 3.x series instead.
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rake', '>= 10.0'
  # spec/simhash_spec.rb requires 'fast-stemmer' to exercise the stemming option.
  spec.add_development_dependency 'fast-stemmer', '~> 1.0'
end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+ require 'fast-stemmer'
3
+
4
# Behavioural examples for Simhash.generate and Simhash.hamming_distance.
describe Simhash do
  it 'should generate the same simhash for the same string, and a different simhash for a different string' do
    str1 = 'I like going to the beach'
    str2 = 'I like going to the beach'
    str3 = 'I like going to the mall'

    expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
    expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
  end

  it 'should strip punctuation and capitalization properly' do
    str1 = "Hello, nurse! How's it going today... my man?"
    str2 = 'hello nurse hows it going today my man'

    expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
  end

  it "should respect the 'unique' option" do
    str1 = 'apple pear'
    str2 = 'apple apple apple pear'

    expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
    expect(Simhash.generate(str1, unique: false)).not_to eq Simhash.generate(str2, unique: false)
  end

  it "should respect the 'stop_words' option" do
    str1 = 'I like the man on the moon.'
    str2 = 'like man moon'
    # Stop words are given in lower case because tokens are downcased before
    # they are compared against the list.
    stop_words = %w[i the on]

    expect(Simhash.generate(str1, stop_words: stop_words)).to eq Simhash.generate(str2, stop_words: stop_words)
    expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
  end

  it "should respect the 'stemming' option" do
    # str2 is str1 with every word already reduced to its stem.
    str1 = 'My crazy cars have crazy minds!'
    str2 = 'My crazi car have crazi mind!'

    expect(Simhash.generate(str1, stemming: true)).to eq Simhash.generate(str2, stemming: true)
    expect(Simhash.generate(str1, stemming: false)).not_to eq Simhash.generate(str2, stemming: false)
  end

  it 'should calculate hamming distances correctly' do
    expect(Simhash.hamming_distance(2, 2)).to eq 0
    expect(Simhash.hamming_distance(2, 3)).to eq 1
    expect(Simhash.hamming_distance(255, 197)).to eq 4
  end
end
@@ -0,0 +1,99 @@
1
+ require 'coveralls'
2
+ Coveralls.wear!
3
+
4
+ require 'simhash'
5
+
6
+ # This file was generated by the `rspec --init` command. Conventionally, all
7
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
8
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
9
+ # this file to always be loaded, without a need to explicitly require it in any
10
+ # files.
11
+ #
12
+ # Given that it is always loaded, you are encouraged to keep this file as
13
+ # light-weight as possible. Requiring heavyweight dependencies from this file
14
+ # will add to the boot time of your test suite on EVERY test run, even for an
15
+ # individual file that may not need all of that loaded. Instead, consider making
16
+ # a separate helper file that requires the additional dependencies and performs
17
+ # the additional setup, and require it from the spec files that actually need
18
+ # it.
19
+ #
20
+ # The `.rspec` file also contains a few flags that are not defaults but that
21
+ # users commonly want.
22
+ #
23
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
24
RSpec.configure do |config|
  # rspec-expectations config goes here. You can use an alternate
  # assertion/expectation library such as wrong or the stdlib/minitest
  # assertions if you prefer.
  config.expect_with :rspec do |expectations|
    # This option will default to `true` in RSpec 4. It makes the `description`
    # and `failure_message` of custom matchers include text for helper methods
    # defined using `chain`, e.g.:
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # ...rather than:
    #     # => "be bigger than 2"
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks config goes here. You can use an alternate test double
  # library (such as bogus or mocha) by changing the `mock_with` option here.
  config.mock_with :rspec do |mocks|
    # Prevents you from mocking or stubbing a method that does not exist on
    # a real object. This is generally recommended, and will default to
    # `true` in RSpec 4.
    mocks.verify_partial_doubles = true
  end

  # The settings below are suggested to provide a good initial experience
  # with RSpec, but feel free to customize to your heart's content.
  # They are all left commented out, so none of them is active.
  #   # These two settings work together to allow you to limit a spec run
  #   # to individual examples or groups you care about by tagging them with
  #   # `:focus` metadata. When nothing is tagged with `:focus`, all examples
  #   # get run.
  #   config.filter_run :focus
  #   config.run_all_when_everything_filtered = true
  #
  #   # Allows RSpec to persist some state between runs in order to support
  #   # the `--only-failures` and `--next-failure` CLI options. We recommend
  #   # you configure your source control system to ignore this file.
  #   config.example_status_persistence_file_path = "spec/examples.txt"
  #
  #   # Limits the available syntax to the non-monkey patched syntax that is
  #   # recommended. For more details, see:
  #   #   - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
  #   #   - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   #   - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
  #   config.disable_monkey_patching!
  #
  #   # This setting enables warnings. It's recommended, but in some cases may
  #   # be too noisy due to issues in dependencies.
  #   config.warnings = true
  #
  #   # Many RSpec users commonly either run the entire suite or an individual
  #   # file, and it's useful to allow more verbose output when running an
  #   # individual spec file.
  #   if config.files_to_run.one?
  #     # Use the documentation formatter for detailed output,
  #     # unless a formatter has already been configured
  #     # (e.g. via a command-line flag).
  #     config.default_formatter = 'doc'
  #   end
  #
  #   # Print the 10 slowest examples and example groups at the
  #   # end of the spec run, to help surface which specs are running
  #   # particularly slow.
  #   config.profile_examples = 10
  #
  #   # Run specs in random order to surface order dependencies. If you find an
  #   # order dependency and want to debug it, you can fix the order by providing
  #   # the seed, which is printed after each run.
  #   #     --seed 1234
  #   config.order = :random
  #
  #   # Seed global randomization in this process using the `--seed` CLI option.
  #   # Setting this allows you to use `--seed` to deterministically reproduce
  #   # test failures related to randomization by passing the same `--seed` value
  #   # as the one that triggered the failure.
  #   Kernel.srand config.seed
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simhash2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jonathan Wong
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description:
42
+ email:
43
+ - jonathan@armchairtheorist.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".coveralls.yml"
49
+ - ".gitignore"
50
+ - ".rspec"
51
+ - ".rubocop.yml"
52
+ - ".travis.yml"
53
+ - Gemfile
54
+ - LICENSE.txt
55
+ - README.md
56
+ - Rakefile
57
+ - lib/simhash.rb
58
+ - lib/simhash/version.rb
59
+ - simhash2.gemspec
60
+ - spec/simhash_spec.rb
61
+ - spec/spec_helper.rb
62
+ homepage: http://github.com/armchairtheorist/simhash
63
+ licenses:
64
+ - MIT
65
+ metadata: {}
66
+ post_install_message:
67
+ rdoc_options: []
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project:
82
+ rubygems_version: 2.6.11
83
+ signing_key:
84
+ specification_version: 4
85
+ summary: A rewrite of the 'simhash' gem, which is an implementation of Moses Charikar's
86
+ simhashes in Ruby.
87
+ test_files: []