segno 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 67f6327b27ffe04393f1ad13a7838a0b6e3ce24b
4
+ data.tar.gz: 2f81782ef6e507a62221fb083a0fd3a5ba1d0f4c
5
+ SHA512:
6
+ metadata.gz: 84de431e555d2026c5159d0eb627d7e9512906486c85580ffee31e6acc3e47e5b8110b7ff62125e8d1f9c7f8d3d56d2ed1ddbe2abd40683d3aa7cad76d98dd75
7
+ data.tar.gz: 6bea7812f4b3f0357f04ee183dad49b0f2c235bc36c6e8a9b7ec4faee1b4f0561ffbf8fa6dc40189866912d6c032672a6544158436924e58d9fccef40bdb86eb
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .idea
7
+ .env*
8
+ Gemfile.lock
9
+ InstalledFiles
10
+ _yardoc
11
+ coverage
12
+ doc/
13
+ lib/bundler/man
14
+ pkg
15
+ rdoc
16
+ spec/reports
17
+ test/tmp
18
+ test/version_tmp
19
+ tmp
20
+ vendor/bundle
21
+ /spec/fixtures/vcr_cassettes
22
+ .DS_Store
data/CHANGELOG.md ADDED
@@ -0,0 +1,25 @@
1
+ # 変更履歴
2
+
3
+ ## バージョニング規則
4
+
5
+ バージョン番号は MAJOR.MINOR.PATCH と表記し、
6
+ 以下の3つの法則によって更新される。
7
+ このとき MAJOR, MINOR, PATCH は非負の整数とし、頭にゼロは入れず、それぞれの要素は数値的に増加する。
8
+
9
+ 1. 互換性のない API の変更を行うときに MAJOR バージョンをインクリメントする
10
+ 2. 後方互換性のある方法で機能性を追加したときに MINOR バージョンをインクリメントする
11
+ 3. そして、後方互換性のあるバグ フィックスをしたときに PATCH バージョンをインクリメントする
12
+
13
+ メジャー バージョン 0 (0.y.z) は、初期の開発のためのものであり、
14
+ いつでもどんなものでも変化する可能性があるため、
15
+ パブリック API は安定的と考えられるべきではない。
16
+
17
+ バージョン 1.0.0 はパブリック API を定義する。
18
+ このリリースの後でバージョン番号がインクリメントされることは、
19
+ このパブリック API とそれの変更がどの程度であるかに左右される。
20
+
21
+ [参考:Semantic Versioning](http://semver.org/)
22
+
23
+
24
+ 0.0.1
25
+ ---
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in segno.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Shogo Kawaguchi
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,28 @@
1
+ # Segno
2
+
3
+ An implementation of the b-bit MinHash algorithm in Ruby
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'segno'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install segno
18
+
19
+ ## Usage
20
+
21
+
22
+ ## Contributing
23
+
24
+ 1. Fork it ( https://github.com/k-shogo/segno/fork )
25
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
26
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
27
+ 4. Push to the branch (`git push origin my-new-feature`)
28
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/lib/segno.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'segno/version'
2
+ require 'segno/configuration'
3
+ require 'segno/generator'
4
+ require 'segno/hash_vec'
5
+ require 'json'
6
+ require 'murmurhash3'
7
+
8
+ module Segno
9
+ extend Configuration
10
+
11
+ def self.generator(options = {})
12
+ Segno::Generator.new(options)
13
+ end
14
+
15
+ def self.method_missing(method_name, *args, &block)
16
+ return super unless generator.respond_to?(method_name)
17
+ generator.send(method_name, *args, &block)
18
+ end
19
+
20
+ def self.respond_to?(method_name, include_private = false)
21
+ return generator.respond_to?(method_name, include_private) || super
22
+ end
23
+ end
@@ -0,0 +1,41 @@
1
module Segno
  # Mixin that gives the extending object a set of readable/writable
  # settings together with their defaults. Extending an object with this
  # module immediately resets it to the defaults.
  module Configuration
    # Names of every supported setting.
    VALID_OPTIONS_KEYS = [
      :b,
      :k,
      :seed,
      :seed_vec
    ].freeze

    DEFAULT_B = 1
    DEFAULT_K = 128
    DEFAULT_SEED = 2090358822
    # nil means no explicit seed list has been configured.
    DEFAULT_SEED_VEC = nil

    # Parenthesised so the splat is not reported as an ambiguous `*`.
    attr_accessor(*VALID_OPTIONS_KEYS)

    # Populates the defaults as soon as an object extends this module.
    def self.extended(base)
      base.reset
    end

    # Yields self so settings can be assigned in a block.
    #
    # @yieldparam config [Configuration] the receiver
    # @return [self]
    def configure
      yield self
      self
    end

    # Snapshot of the current settings.
    #
    # @return [Hash{Symbol => Object}] one entry per VALID_OPTIONS_KEYS name
    def options
      VALID_OPTIONS_KEYS.each_with_object({}) do |key, settings|
        settings[key] = send(key)
      end
    end

    # Restores every setting to its DEFAULT_* value.
    #
    # @return [self]
    def reset
      self.b = DEFAULT_B
      self.k = DEFAULT_K
      self.seed = DEFAULT_SEED
      self.seed_vec = DEFAULT_SEED_VEC
      self
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require 'segno/seed'
2
+ require 'segno/minhash'
3
module Segno
  # Object that owns one concrete set of settings and exposes the hashing
  # methods mixed in from Seed and MinHash.
  class Generator
    include Seed
    include MinHash

    attr_accessor(*Configuration::VALID_OPTIONS_KEYS)

    # Copies the module-level settings, overlaid with +options+, onto this
    # instance. When no seed list results from that, one is generated with
    # gen_seed using the configured k.
    #
    # @param options [Hash] overrides keyed by the VALID_OPTIONS_KEYS names
    def initialize(options = {})
      settings = Segno.options.merge(options)
      Configuration::VALID_OPTIONS_KEYS.each { |name| send("#{name}=", settings[name]) }
      self.seed_vec ||= gen_seed(k)
    end
  end
end
@@ -0,0 +1,41 @@
1
module Segno
  # Wraps a vector of equally sized bit strings (for example
  # ["01", "10", ...]) and provides comparison and a compact text
  # serialisation.
  class HashVec
    attr_accessor :vec

    # "<b>.<k>.<hex digits>" as produced by HashVec.dump.
    VALID_SERIALIZE_PATTERN = /\A(?<b>[0-9]{1,})\.(?<k>[0-9]{1,})\.(?<hash>[0-9a-f]{1,})\z/

    # @param vec [Array<String>] bit strings, one per position
    def initialize(vec)
      @vec = vec
    end

    # All bit strings concatenated, read as one binary number and rendered
    # in lower-case hexadecimal. Leading zero bits are not preserved here;
    # HashVec.load restores them from b and k.
    #
    # @return [String]
    def to_s
      @vec.join.to_i(2).to_s(16)
    end

    # @return [Integer] length of the first element (bits per position)
    def b
      @vec.first.size
    end

    # @return [Integer] number of positions in the vector
    def k
      @vec.size
    end

    # Similarity estimate computed from the share of positions at which
    # both vectors hold the same bit string:
    #
    #   (2**b * matches / k - 1) / (2**b - 1)
    #
    # @param hash_vec [HashVec] vector to compare with
    # @return [Rational]
    # @raise [IndexError] when the two vectors differ in length
    #   (raised by Array#transpose)
    def jaccard(hash_vec)
      matches = [vec, hash_vec.vec].transpose.count { |mine, theirs| mine == theirs }
      buckets = 2**b
      (buckets * Rational(matches, k) - 1) / (buckets - 1)
    end

    # Serialises a vector as "<b>.<k>.<hex>".
    #
    # @param hash_vec [HashVec]
    # @return [String]
    def self.dump(hash_vec)
      [hash_vec.b, hash_vec.k, hash_vec.to_s].join('.')
    end

    # Rebuilds a vector from the format written by HashVec.dump.
    #
    # @param string [String]
    # @return [HashVec, nil] nil when +string+ does not match
    #   VALID_SERIALIZE_PATTERN
    def self.load(string)
      key = string.match(VALID_SERIALIZE_PATTERN)
      return unless key

      b = key['b'].to_i
      k = key['k'].to_i
      # Left-pad to b * k bits so the leading zeros dropped by #to_s return,
      # then cut the bit string back into b-sized pieces.
      bits = key['hash'].to_i(16).to_s(2).rjust(b * k, '0')
      new(bits.scan(/.{1,#{b}}/))
    end
  end
end
@@ -0,0 +1,20 @@
1
module Segno
  # Hashing routines mixed into the generator. The including object must
  # provide +b+ (number of bits kept per hash) and +seed_vec+ (list of
  # seeds).
  module MinHash
    # Smallest MurmurHash3 (32-bit) value over all targets for one seed.
    #
    # @param targets [Enumerable<String>] strings to hash
    # @param seed [Integer] seed passed to MurmurHash3::V32.str_hash
    # @return [Integer, nil] nil when +targets+ is empty
    def minhash(targets, seed)
      targets.map { |str| MurmurHash3::V32.str_hash(str, seed) }.min
    end

    # The lowest +b+ bits of the min-hash as a zero-padded bit string.
    # NOTE(review): assumes the hash value is non-negative - confirm
    # against the murmurhash3 gem.
    #
    # @param targets [Enumerable<String>]
    # @param seed [Integer]
    # @return [String] exactly +b+ characters of "0"/"1"
    def bbit_minhash(targets, seed)
      minhash(targets, seed).to_s(2).rjust(b, '0')[-b..-1]
    end

    # One b-bit min-hash per seed in +seed_vec+, wrapped in a HashVec.
    #
    # @param targets [Enumerable<String>]
    # @return [Segno::HashVec]
    def bbit_minhash_vec(targets)
      Segno::HashVec.new(seed_vec.map { |seed| bbit_minhash(targets, seed) })
    end
  end
end
data/lib/segno/seed.rb ADDED
@@ -0,0 +1,14 @@
1
module Segno
  # Seed generation mixed into the generator. The including object must
  # provide +seed+, the starting value of the chain.
  module Seed
    # Derives +length+ seeds by hashing repeatedly: the first entry is
    # MurmurHash3::V32.int32_hash(seed) and every further entry is the
    # hash of the previous one. The starting +seed+ itself is not part of
    # the result.
    #
    # @param length [Integer] number of seeds to produce
    # @return [Array<Integer>] empty when +length+ is zero or negative
    def gen_seed(length)
      current = seed
      length.times.map { current = MurmurHash3::V32.int32_hash(current) }
    end
  end
end
@@ -0,0 +1,3 @@
1
module Segno
  # Gem version string, read by segno.gemspec. Frozen so the constant
  # cannot be modified in place.
  VERSION = '0.0.1'.freeze
end
data/segno.gemspec ADDED
@@ -0,0 +1,28 @@
1
# coding: utf-8

# Put this gem's lib directory on the load path so the version constant can
# be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'segno/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name          = 'segno'
  gem.version       = Segno::VERSION
  gem.authors       = ['Shogo Kawaguchi']
  gem.email         = ['platycod0n.ramosa@gmail.com']
  gem.summary       = 'An implementation of the b bit MinHash algorithm in ruby'
  gem.description   = 'An implementation of the b bit MinHash algorithm in ruby'
  gem.homepage      = 'https://github.com/k-shogo/segno'
  gem.license       = 'MIT'

  # Runtime dependency
  gem.add_runtime_dependency 'murmurhash3'

  # Development-only dependencies
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'simplecov'
  gem.add_development_dependency 'pry'

  # Package contents are whatever git tracks; executables and test files
  # are picked out of that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']
end
@@ -0,0 +1,21 @@
1
# -*- encoding: utf-8 -*-

require 'spec_helper'

# Checks what the Segno module reports through respond_to?.
describe Segno::Generator do
  context '#respond_to?' do
    # `be true` / `be false` are used instead of be_true / be_false, which
    # RSpec 3 no longer provides; respond_to? returns real booleans here.
    it '存在するメソッドでtrueが返る' do
      expect(Segno.respond_to?(:configure)).to be true
    end

    it '存在しないメソッドでfalseが返る' do
      expect(Segno.respond_to?(:hoge)).to be false
    end
  end

  # Disabled example carried over unchanged in substance (description
  # translated to English):
  # it 'can call a method' do
  #   expect(Segno.api_key('user_id')).to_not be_nil
  # end
end
@@ -0,0 +1,13 @@
1
# Coverage is started first, before the library under test is required.
require 'simplecov'
SimpleCov.start do
  # Leave the specs themselves and vendored code out of the report.
  add_filter 'spec'
  add_filter 'vendor'
end

require 'segno'
require 'rspec'

# Use RSpec's own mocking framework.
RSpec.configure { |rspec_config| rspec_config.mock_framework = :rspec }
13
+
metadata ADDED
@@ -0,0 +1,146 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: segno
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Shogo Kawaguchi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: murmurhash3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.6'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pry
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: An implementation of the b bit MinHash algorithm in ruby
98
+ email:
99
+ - platycod0n.ramosa@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - CHANGELOG.md
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - lib/segno.rb
111
+ - lib/segno/configuration.rb
112
+ - lib/segno/generator.rb
113
+ - lib/segno/hash_vec.rb
114
+ - lib/segno/minhash.rb
115
+ - lib/segno/seed.rb
116
+ - lib/segno/version.rb
117
+ - segno.gemspec
118
+ - spec/generator_spec.rb
119
+ - spec/spec_helper.rb
120
+ homepage: https://github.com/k-shogo/segno
121
+ licenses:
122
+ - MIT
123
+ metadata: {}
124
+ post_install_message:
125
+ rdoc_options: []
126
+ require_paths:
127
+ - lib
128
+ required_ruby_version: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - ">="
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ required_rubygems_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ requirements: []
139
+ rubyforge_project:
140
+ rubygems_version: 2.0.3
141
+ signing_key:
142
+ specification_version: 4
143
+ summary: An implementation of the b bit MinHash algorithm in ruby
144
+ test_files:
145
+ - spec/generator_spec.rb
146
+ - spec/spec_helper.rb