simhash2 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bd2ed19bec8ea85cee9f79596b1462883eca1df5
4
+ data.tar.gz: 642f9ce751132870bb0dfe14301fbe3199bd0bcd
5
+ SHA512:
6
+ metadata.gz: 34d638e74e3dade90dfa25e9d9d56e3b2574b155cd90123c5d359e73c83afa0270613c1325b5b9cf198e5c79b0063b319ce9c6d55df6715e954834d1abd8afb0
7
+ data.tar.gz: eb0302bc772a0d53ba5ce6099bc3cb40376e0dfc9c987ab95194888e6483afb66f146b518fce240e1c4b69ab05cfeff70893eef37c52fa474dc98e6c5b62868d
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.gitignore ADDED
@@ -0,0 +1,50 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format doc
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,6 @@
1
+ AllCops:
2
+ Exclude:
3
+ - 'spec/spec_helper.rb'
4
+
5
+ Metrics/LineLength:
6
+ Enabled: false
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2
data/Gemfile ADDED
@@ -0,0 +1,8 @@
# Fetch gems over TLS; plain-HTTP sources can be tampered with in transit.
source 'https://rubygems.org'

# Pull in the dependencies declared in simhash2.gemspec.
gemspec

# Tooling used only while developing and testing the gem.
gem 'coveralls', require: false
gem 'rake'
gem 'rspec'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Jonathan Wong
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,6 @@
1
+ [![Gem Version](https://badge.fury.io/rb/simhash2.svg)](https://badge.fury.io/rb/simhash2)
2
+ [![Code Climate](https://codeclimate.com/github/armchairtheorist/simhash/badges/gpa.svg)](https://codeclimate.com/github/armchairtheorist/simhash)
3
+ [![Build Status](https://travis-ci.org/armchairtheorist/simhash.svg?branch=master)](https://travis-ci.org/armchairtheorist/simhash)
4
+ [![Coverage Status](https://coveralls.io/repos/github/armchairtheorist/simhash/badge.svg?branch=master)](https://coveralls.io/github/armchairtheorist/simhash?branch=master)
5
+
6
+ # Simhash
data/Rakefile ADDED
@@ -0,0 +1,5 @@
require 'rspec/core/rake_task'

# Register the `spec` rake task provided by RSpec.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: [:spec]
@@ -0,0 +1,3 @@
1
+ module Simhash
2
+ VERSION = '0.0.1'.freeze
3
+ end
data/lib/simhash.rb ADDED
@@ -0,0 +1,68 @@
1
+ require 'simhash/version'
2
+
# Simhash fingerprints for text.
#
# A fingerprint is a HASHBITS-wide non-negative Integer built from the
# per-token hashes of the input; `hamming_distance` counts the bits in which
# two fingerprints differ.
module Simhash
  extend self

  # Width, in bits, of every generated fingerprint.
  HASHBITS = 64

  # Defaults merged under the options passed to `generate` /
  # `generate_from_tokens`:
  #   min_token_length - tokens shorter than this (after normalisation) are dropped
  #   unique           - when true, duplicate tokens are counted once
  #   stemming         - when true, `stem` is called on every token; core Ruby's
  #                      String has no such method, so the caller must have
  #                      loaded a library that adds it
  #   stop_words       - collection of (lower-case) tokens to discard
  OPTIONS = {
    min_token_length: 1,
    unique: false,
    stemming: false,
    stop_words: []
  }.freeze

  # Fingerprints a string by splitting it on whitespace and hashing the tokens.
  #
  # @param str [String] text to fingerprint
  # @param options [Hash] overrides for OPTIONS
  # @return [Integer] the fingerprint
  def generate(str, options = {})
    generate_from_tokens(str.split(/\s+/), options)
  end

  # Fingerprints an already tokenised input.
  #
  # The +tokens+ array is never modified; filtering happens on a copy, so
  # frozen arrays are accepted as well.
  #
  # @param tokens [Array] tokens to fingerprint (nil entries are ignored)
  # @param options [Hash] overrides for OPTIONS
  # @return [Integer] the fingerprint; with no surviving tokens every bit is
  #   set, because a tally of zero counts as "set"
  def generate_from_tokens(tokens, options = {})
    filtered = filter_tokens(tokens, OPTIONS.merge(options))

    # tallies[bit] goes up for every token hash with that bit set and down
    # for every token hash with it clear.
    tallies = Array.new(HASHBITS, 0)
    filtered.each do |token|
      token_hash = simple_string_hash(token, HASHBITS)
      HASHBITS.times { |bit| tallies[bit] += token_hash[bit].zero? ? -1 : 1 }
    end

    # A bit of the fingerprint is set when its tally is not negative.
    HASHBITS.times.reduce(0) do |simhash, bit|
      tallies[bit] >= 0 ? simhash | (1 << bit) : simhash
    end
  end

  # Counts the bit positions in which two fingerprints differ.
  #
  # @param simhash1 [#to_i]
  # @param simhash2 [#to_i]
  # @return [Integer] number of differing bits
  def hamming_distance(simhash1, simhash2)
    (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
  end

  private

  # Hashes a string into a non-negative Integer using a multiply/xor pass
  # over its bytes, masked to +length+ bits on every step.
  #
  # @param str [String]
  # @param length [Integer] width of the mask in bits
  # @return [Integer] 0 for the empty string
  def simple_string_hash(str, length)
    return 0 if str == ''

    multiplier = 1_000_003
    mask = (1 << length) - 1

    x = str.getbyte(0) << 7
    str.each_byte { |byte| x = ((x * multiplier) ^ byte) & mask }

    # x is non-negative after masking, so no special-casing of -1 is needed.
    x ^ str.bytesize
  end

  # Normalises and filters tokens according to +options+.
  #
  # Works on a fresh array so the caller's tokens are left untouched.
  #
  # @param tokens [Array] raw tokens; nil entries are skipped
  # @param options [Hash] fully merged options (see OPTIONS)
  # @return [Array<String>] the tokens that should be hashed
  def filter_tokens(tokens, options)
    stop_words = options[:stop_words]

    # Lower-case each token and strip everything matched by \W.
    kept = tokens.compact.map { |token| token.to_s.downcase.gsub(/\W+/, '') }
    kept.reject! { |token| token.length < options[:min_token_length] }
    kept.reject! { |token| stop_words.include?(token) } unless stop_words.nil? || stop_words.empty?
    kept.map!(&:stem) if options[:stemming]
    kept.uniq! if options[:unique]
    kept
  end
end
data/simhash2.gemspec ADDED
@@ -0,0 +1,18 @@
1
+ require File.expand_path('../lib/simhash/version', __FILE__)
2
+
# Package definition for the simhash2 gem.
Gem::Specification.new do |spec|
  spec.name     = 'simhash2'
  spec.version  = Simhash::VERSION
  spec.authors  = ['Jonathan Wong']
  spec.email    = ['jonathan@armchairtheorist.com']
  spec.summary  = 'A rewrite of the \'simhash\' gem, which is an implementation of Moses Charikar\'s simhashes in Ruby.'
  spec.homepage = 'https://github.com/armchairtheorist/simhash'
  spec.license  = 'MIT'

  # Package every file tracked by git.
  spec.files = `git ls-files`.split("\n")
  # Select the spec files from the tracked list. The previous
  # `git ls-files -- {spec}/*` pattern matched nothing, because a brace
  # group with a single element is not expanded.
  spec.test_files = spec.files.grep(%r{\Aspec/})
  spec.require_paths = ['lib']

  # '~> 0' would mean '>= 0, < 1', which excludes the RSpec 3 API used by the
  # specs and any current rake release.
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rake', '>= 10.0'
  # The spec suite requires 'fast-stemmer' to exercise the :stemming option.
  spec.add_development_dependency 'fast-stemmer', '~> 1.0'
end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+ require 'fast-stemmer'
3
+
# Behavioural examples for the public Simhash API.
describe Simhash do
  it 'should generate the same simhash for the same string, and a different simhash for a different string' do
    beach = 'I like going to the beach'
    beach_again = 'I like going to the beach'
    mall = 'I like going to the mall'

    expect(Simhash.generate(beach)).to eq Simhash.generate(beach_again)
    expect(Simhash.generate(beach)).not_to eq Simhash.generate(mall)
  end

  it 'should strip punctuation and capitalization properly' do
    noisy = "Hello, nurse! How's it going today... my man?"
    clean = 'hello nurse hows it going today my man'

    noisy_hash = Simhash.generate(noisy, unique: true)
    clean_hash = Simhash.generate(clean, unique: true)

    expect(noisy_hash).to eq clean_hash
  end

  it "should respect the 'unique' option" do
    once = 'apple pear'
    repeated = 'apple apple apple pear'

    # Duplicates collapse when :unique is on, and are weighted when it is off.
    expect(Simhash.generate(once, unique: true)).to eq Simhash.generate(repeated, unique: true)
    expect(Simhash.generate(once, unique: false)).not_to eq Simhash.generate(repeated, unique: false)
  end

  it "should respect the 'stop_words' option" do
    sentence = 'I like the man on the moon.'
    keywords = 'like man moon'
    ignored = %w[i the on]

    expect(Simhash.generate(sentence, stop_words: ignored)).to eq Simhash.generate(keywords, stop_words: ignored)
    expect(Simhash.generate(sentence)).not_to eq Simhash.generate(keywords)
  end

  it "should respect the 'stemming' option" do
    inflected = 'My crazy cars have crazy minds!'
    stemmed = 'My crazi car have crazi mind!'

    expect(Simhash.generate(inflected, stemming: true)).to eq Simhash.generate(stemmed, stemming: true)
    expect(Simhash.generate(inflected, stemming: false)).not_to eq Simhash.generate(stemmed, stemming: false)
  end

  it 'should calculate hamming distances correctly' do
    # [left, right, expected number of differing bits]
    [[2, 2, 0], [2, 3, 1], [255, 197, 4]].each do |left, right, distance|
      expect(Simhash.hamming_distance(left, right)).to eq distance
    end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'coveralls'
2
+ Coveralls.wear!
3
+
4
+ require 'simhash'
5
+
6
+ # This file was generated by the `rspec --init` command. Conventionally, all
7
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
8
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
9
+ # this file to always be loaded, without a need to explicitly require it in any
10
+ # files.
11
+ #
12
+ # Given that it is always loaded, you are encouraged to keep this file as
13
+ # light-weight as possible. Requiring heavyweight dependencies from this file
14
+ # will add to the boot time of your test suite on EVERY test run, even for an
15
+ # individual file that may not need all of that loaded. Instead, consider making
16
+ # a separate helper file that requires the additional dependencies and performs
17
+ # the additional setup, and require it from the spec files that actually need
18
+ # it.
19
+ #
20
+ # The `.rspec` file also contains a few flags that are not defaults but that
21
+ # users commonly want.
22
+ #
23
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
24
+ RSpec.configure do |config|
25
+ # rspec-expectations config goes here. You can use an alternate
26
+ # assertion/expectation library such as wrong or the stdlib/minitest
27
+ # assertions if you prefer.
28
+ config.expect_with :rspec do |expectations|
29
+ # This option will default to `true` in RSpec 4. It makes the `description`
30
+ # and `failure_message` of custom matchers include text for helper methods
31
+ # defined using `chain`, e.g.:
32
+ # be_bigger_than(2).and_smaller_than(4).description
33
+ # # => "be bigger than 2 and smaller than 4"
34
+ # ...rather than:
35
+ # # => "be bigger than 2"
36
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
37
+ end
38
+
39
+ # rspec-mocks config goes here. You can use an alternate test double
40
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
41
+ config.mock_with :rspec do |mocks|
42
+ # Prevents you from mocking or stubbing a method that does not exist on
43
+ # a real object. This is generally recommended, and will default to
44
+ # `true` in RSpec 4.
45
+ mocks.verify_partial_doubles = true
46
+ end
47
+
48
+ # The settings below are suggested to provide a good initial experience
49
+ # with RSpec, but feel free to customize to your heart's content.
50
+ # # These two settings work together to allow you to limit a spec run
51
+ # # to individual examples or groups you care about by tagging them with
52
+ # # `:focus` metadata. When nothing is tagged with `:focus`, all examples
53
+ # # get run.
54
+ # config.filter_run :focus
55
+ # config.run_all_when_everything_filtered = true
56
+ #
57
+ # # Allows RSpec to persist some state between runs in order to support
58
+ # # the `--only-failures` and `--next-failure` CLI options. We recommend
59
+ # # you configure your source control system to ignore this file.
60
+ # config.example_status_persistence_file_path = "spec/examples.txt"
61
+ #
62
+ # # Limits the available syntax to the non-monkey patched syntax that is
63
+ # # recommended. For more details, see:
64
+ # # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
65
+ # # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
66
+ # # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
67
+ # config.disable_monkey_patching!
68
+ #
69
+ # # This setting enables warnings. It's recommended, but in some cases may
70
+ # # be too noisy due to issues in dependencies.
71
+ # config.warnings = true
72
+ #
73
+ # # Many RSpec users commonly either run the entire suite or an individual
74
+ # # file, and it's useful to allow more verbose output when running an
75
+ # # individual spec file.
76
+ # if config.files_to_run.one?
77
+ # # Use the documentation formatter for detailed output,
78
+ # # unless a formatter has already been configured
79
+ # # (e.g. via a command-line flag).
80
+ # config.default_formatter = 'doc'
81
+ # end
82
+ #
83
+ # # Print the 10 slowest examples and example groups at the
84
+ # # end of the spec run, to help surface which specs are running
85
+ # # particularly slow.
86
+ # config.profile_examples = 10
87
+ #
88
+ # # Run specs in random order to surface order dependencies. If you find an
89
+ # # order dependency and want to debug it, you can fix the order by providing
90
+ # # the seed, which is printed after each run.
91
+ # # --seed 1234
92
+ # config.order = :random
93
+ #
94
+ # # Seed global randomization in this process using the `--seed` CLI option.
95
+ # # Setting this allows you to use `--seed` to deterministically reproduce
96
+ # # test failures related to randomization by passing the same `--seed` value
97
+ # # as the one that triggered the failure.
98
+ # Kernel.srand config.seed
99
+ end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simhash2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jonathan Wong
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description:
42
+ email:
43
+ - jonathan@armchairtheorist.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".coveralls.yml"
49
+ - ".gitignore"
50
+ - ".rspec"
51
+ - ".rubocop.yml"
52
+ - ".travis.yml"
53
+ - Gemfile
54
+ - LICENSE.txt
55
+ - README.md
56
+ - Rakefile
57
+ - lib/simhash.rb
58
+ - lib/simhash/version.rb
59
+ - simhash2.gemspec
60
+ - spec/simhash_spec.rb
61
+ - spec/spec_helper.rb
62
+ homepage: http://github.com/armchairtheorist/simhash
63
+ licenses:
64
+ - MIT
65
+ metadata: {}
66
+ post_install_message:
67
+ rdoc_options: []
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project:
82
+ rubygems_version: 2.6.11
83
+ signing_key:
84
+ specification_version: 4
85
+ summary: A rewrite of the 'simhash' gem, which is an implementation of Moses Charikar's
86
+ simhashes in Ruby.
87
+ test_files: []