minwise 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +50 -0
- data/Rakefile +30 -0
- data/ext/minwise/extconf.rb +10 -0
- data/ext/minwise/minwise.c +115 -0
- data/ext/minwise/minwise.h +6 -0
- data/lib/minwise/minhash.rb +80 -0
- data/lib/minwise/version.rb +5 -0
- data/lib/minwise.rb +20 -0
- data/sig/minwise.rbs +4 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 24510ece26235c8ef8956326e2d05d9162b70b4bede324ab6e548d2746194b8a
|
4
|
+
data.tar.gz: b1606b461d02f4a420daf8e326ce75e76af5ce7cf0bbf43f7064247c0ea0b968
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7867b8181456107e42783cb7d1e2a62827710e8bfd5e9f4cd94d9521fbee069841c19b8a636d5dba477c4211d4bd2134d4cad1bdfcc9322e714853d696250959
|
7
|
+
data.tar.gz: e43b8b14d8cc308ce0b752c8de667d163f29905398b0201cf255b5990e53b5b5d1d67f75b3596a752631bc48c192b0fde3413a7a73b5f3f4dbef56fbd70e343c
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-rake
|
3
|
+
- rubocop-minitest
|
4
|
+
|
5
|
+
AllCops:
|
6
|
+
TargetRubyVersion: 2.6
|
7
|
+
NewCops: enable
|
8
|
+
|
9
|
+
Style/StringLiterals:
|
10
|
+
Enabled: true
|
11
|
+
EnforcedStyle: double_quotes
|
12
|
+
|
13
|
+
Style/StringLiteralsInInterpolation:
|
14
|
+
Enabled: true
|
15
|
+
EnforcedStyle: double_quotes
|
16
|
+
|
17
|
+
Layout/LineLength:
|
18
|
+
Max: 120
|
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2023 Samuel Scully
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Minwise
|
2
|
+
|
3
|
+
Fast locality sensitive hashes using the minhash algorithm.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Install the gem and add to the application's Gemfile by executing:
|
8
|
+
|
9
|
+
$ bundle add minwise
|
10
|
+
|
11
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
12
|
+
|
13
|
+
$ gem install minwise
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
Generate the minhash of a string:
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
Minwise::Minhash.digest("Chunky bacon")
|
21
|
+
# => [437974493, 147728091, 1185236492, ...]
|
22
|
+
```
|
23
|
+
|
24
|
+
Generate a minhash with options:
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
Minwise::Minhash.digest("Chunky bacon", shingle_size: 9, hash_size: 500, seed: 42)
|
28
|
+
# => [203094719, 599941115, 1256960069, ...]
|
29
|
+
```
|
30
|
+
|
31
|
+
You can also generate a minhash of a bare set of integers:
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
Minwise::Minhash.digest([1, 2, 3])
|
35
|
+
# => [1005141192, 713750329, 346603495, ...]
|
36
|
+
```
|
37
|
+
|
38
|
+
## Development
|
39
|
+
|
40
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
41
|
+
|
42
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
43
|
+
|
44
|
+
## Contributing
|
45
|
+
|
46
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/sbscully/minwise.
|
47
|
+
|
48
|
+
## License
|
49
|
+
|
50
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "bundler/gem_tasks"
|
4
|
+
require "rake/testtask"
|
5
|
+
|
6
|
+
Rake::TestTask.new(:test) do |t|
|
7
|
+
t.libs << "test"
|
8
|
+
t.libs << "lib"
|
9
|
+
t.test_files = FileList["test/**/test_*.rb"]
|
10
|
+
end
|
11
|
+
|
12
|
+
require "rubocop/rake_task"
|
13
|
+
|
14
|
+
RuboCop::RakeTask.new
|
15
|
+
|
16
|
+
require "rake/extensiontask"
|
17
|
+
|
18
|
+
desc "Compile all the extensions"
|
19
|
+
task build: :compile
|
20
|
+
|
21
|
+
Rake::ExtensionTask.new("minwise") do |ext|
|
22
|
+
ext.lib_dir = "lib/minwise"
|
23
|
+
end
|
24
|
+
|
25
|
+
desc "Benchmark Minwise::Minhash.digest"
|
26
|
+
task benchmark: :build do
|
27
|
+
ruby "./test/benchmark.rb"
|
28
|
+
end
|
29
|
+
|
30
|
+
task default: %i[clobber compile test rubocop]
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "mkmf"
|
4
|
+
|
5
|
+
# Makes all symbols private by default to avoid unintended conflict
|
6
|
+
# with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
|
7
|
+
# selectively, or entirely remove this flag.
|
8
|
+
append_cflags("-fvisibility=hidden")
|
9
|
+
|
10
|
+
create_makefile("minwise/minwise")
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include "minwise.h"
|
3
|
+
|
4
|
+
static VALUE mMinwise;
|
5
|
+
static VALUE cMinwiseMinhash;
|
6
|
+
|
7
|
+
/*
 * Derives one pseudo-random number from `seed` with a single xorshift* round
 * and scales it linearly into the range that starts at `min` and spans
 * `max - min`.
 *
 * The function keeps no state: identical (min, max, seed) arguments always
 * produce the identical result.
 */
static uint64_t randr(uint64_t min, uint64_t max, uint64_t seed) {
  // xorshift*
  uint64_t x = seed + 1; // must not be zero
  if (x == 0) {
    // seed == UINT64_MAX wraps around to zero, and a zero state survives every
    // shift/xor/multiply below unchanged, which would always return `min`.
    x = 1;
  }

  x ^= x >> 12;
  x ^= x << 25;
  x ^= x >> 27;
  x *= 0x2545F4914F6CDD1DULL;

  // Map the 64-bit state onto a fraction of the requested span.
  return (uint64_t)(min + (double)x / UINT64_MAX * (max - min));
}
|
17
|
+
|
18
|
+
/*
 * Fills `hash[0..hash_len)` with the minhash signature of `set[0..set_len)`.
 *
 * For every output slot i a pair of coefficients (a, b) is derived from
 * `seed + i`; the slot receives the smallest value of
 * ((a * element + b) mod p) mod UINT32_MAX over all elements of the set.
 * With an empty set every slot is left at UINT32_MAX.
 */
static void minhash(const uint32_t* set, const size_t set_len, uint32_t* hash, const size_t hash_len, uint64_t seed) {
  const uint64_t p = 4294967311; // first prime greater than UINT32_MAX

  for (size_t i = 0; i < hash_len; i++) {
    uint64_t a = randr(0, p, seed + i);
    // must be odd: step down to the odd number just below the even floor.
    // Draws of 0 or 1 would wrap below zero in unsigned arithmetic and turn
    // `a` into UINT64_MAX, so those two cases are pinned to 1 instead.
    a = (a < 2) ? 1 : (a / 2) * 2 - 1;
    const uint64_t b = randr(0, p, a);

    // NOTE(review): a * set[j] + b is evaluated in 64 bits; when `a` exceeds
    // 2^32 and set[j] is close to UINT32_MAX the product can wrap before the
    // modulo is taken. Confirm whether that rare case matters.
    uint32_t h = UINT32_MAX;
    for (size_t j = 0; j < set_len; j++) {
      const uint32_t x = (uint32_t)(((a * set[j] + b) % p) % UINT32_MAX);

      if (x < h) {
        h = x;
      }
    }

    hash[i] = h;
  }
}
|
40
|
+
|
41
|
+
/*
 * Minwise::Minhash.__hash(set, hash_len, seed) -> Array of Integer
 *
 * Converts the Ruby array `rb_set` to 32-bit unsigned integers, computes a
 * signature of `rb_hash_len` values with minhash(), and returns it as a new
 * Ruby array.
 *
 * Raises TypeError when `rb_set` is not an Array, and whatever the numeric
 * conversion macros raise when an argument or element cannot be converted.
 */
static VALUE c_minhash(VALUE self, VALUE rb_set, VALUE rb_hash_len, VALUE rb_hash_seed) {
  Check_Type(rb_set, T_ARRAY);

  size_t c_hash_len = NUM2SIZET(rb_hash_len);
  // The seed is stored in 64 bits, so convert through `unsigned long long`;
  // `unsigned long` is only 32 bits wide on LLP64 platforms.
  uint64_t c_hash_seed = NUM2ULL(rb_hash_seed);
  size_t rb_set_len = RARRAY_LEN(rb_set);

  // Temporary buffers come from ALLOCV_N rather than malloc: allocation
  // failure raises instead of returning NULL, and the memory is reclaimed
  // even when an element conversion below raises and unwinds this frame.
  VALUE set_guard = 0;
  VALUE hash_guard = 0;
  uint32_t *c_set = ALLOCV_N(uint32_t, set_guard, rb_set_len);
  uint32_t *c_hash = ALLOCV_N(uint32_t, hash_guard, c_hash_len);

  for (size_t i = 0; i < rb_set_len; i++) {
    c_set[i] = NUM2UINT(rb_ary_entry(rb_set, i));
  }

  minhash(c_set, rb_set_len, c_hash, c_hash_len, c_hash_seed);

  VALUE rb_hash = rb_ary_new_capa(c_hash_len);
  for (size_t i = 0; i < c_hash_len; i++) {
    rb_ary_store(rb_hash, i, UINT2NUM(c_hash[i]));
  }

  ALLOCV_END(set_guard);
  ALLOCV_END(hash_guard);

  return rb_hash;
}
|
66
|
+
|
67
|
+
/*
 * Returns the 32-bit FNV-1a hash of the NUL-terminated byte string `str`.
 * The terminating NUL itself is not hashed.
 */
static uint32_t fnv1a(char *str) {
  const unsigned char *byte = (const unsigned char *)str;
  uint32_t digest = 0x811c9dc5; // FNV offset basis

  for (; *byte != 0; byte++) {
    // xor the next byte in, then multiply by the FNV prime
    digest = (digest ^ (uint32_t)*byte) * 0x01000193;
  }

  return digest;
}
|
78
|
+
|
79
|
+
/*
 * Minwise::Minhash.__tokenize(string, shingle_size) -> Array of Integer
 *
 * Slides a window of `rb_shingle_size` bytes over `rb_string` and returns the
 * FNV-1a hash of every window. A string that is not longer than the window
 * yields a single hash of the whole string.
 *
 * The window is measured in bytes (RSTRING_LEN), not characters.
 */
static VALUE c_tokenize(VALUE self, VALUE rb_string, VALUE rb_shingle_size) {
  // Convert the size first: the conversion may call back into Ruby, and the
  // string pointer taken below must not be held across such a call.
  size_t c_shingle_size = NUM2SIZET(rb_shingle_size);
  char *c_string = StringValueCStr(rb_string);
  size_t c_string_len = RSTRING_LEN(rb_string);

  if (c_string_len <= c_shingle_size) {
    VALUE rb_tokens = rb_ary_new_capa(1);
    rb_ary_store(rb_tokens, 0, UINT2NUM(fnv1a(c_string)));

    return rb_tokens;
  }

  size_t rb_tokens_len = c_string_len - c_shingle_size + 1;
  VALUE rb_tokens = rb_ary_new_capa(rb_tokens_len);

  char buffer[c_shingle_size + 1];
  // The terminator belongs in the last valid slot, index c_shingle_size.
  // Index c_shingle_size + 1 is one past the end of the buffer and would also
  // leave the real terminator slot uninitialised for fnv1a to read.
  buffer[c_shingle_size] = 0;

  for (size_t i = 0; i < rb_tokens_len; i++) {
    for (size_t j = 0; j < c_shingle_size; j++) {
      buffer[j] = c_string[i + j];
    }

    rb_ary_store(rb_tokens, i, UINT2NUM(fnv1a(buffer)));
  }

  // Keep the Ruby string alive for as long as c_string has been in use.
  RB_GC_GUARD(rb_string);

  return rb_tokens;
}
|
106
|
+
|
107
|
+
RUBY_FUNC_EXPORTED void
|
108
|
+
Init_minwise(void)
|
109
|
+
{
|
110
|
+
mMinwise = rb_define_module("Minwise");
|
111
|
+
cMinwiseMinhash = rb_define_class_under(mMinwise, "Minhash", rb_cObject);
|
112
|
+
|
113
|
+
rb_define_singleton_method(cMinwiseMinhash, "__hash", c_minhash, 3);
|
114
|
+
rb_define_singleton_method(cMinwiseMinhash, "__tokenize", c_tokenize, 2);
|
115
|
+
}
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Minwise
  # The classic minhash algorithm.
  #
  # The minhash digests of two arrays will share approximately the same
  # proportion of elements as the underlying arrays. This is useful for finding
  # similar items in large datasets.
  #
  # The interface is similar to classes in the `Digest` module of the standard
  # library.
  #
  #   Minwise::Minhash.digest([1, 2, 3])
  #   # => [1005141192, 713750329, 346603495, ...]
  #
  # String inputs are first converted to an array of fixed width chunks of
  # characters called "shingles". For example `"Chunky"` with a shingle
  # size of 3 would become `["Chu", "hun", "unk", "nky"]`, then that array is
  # used to generate the minhash. The `shingle_size` option controls the chunk
  # width used.
  #
  #   Minwise::Minhash.digest("Chunky bacon", shingle_size: 3)
  #   # => [437974493, 147728091, 1185236492, ...]
  #
  # The size of the output `hash_size` and `seed` for generating the hash can
  # also be set in options. Larger minhashes will give more accurate estimates
  # of similarity between items, but are slower to generate and take more space.
  #
  #   Minwise::Minhash.digest("Chunky bacon", hash_size: 900, seed: 84)
  #   # => [355390344, 825885127, 262059926, ...]
  #
  # When comparing two minhashes all the options used to generate the minhashes
  # must be identical for the comparison to be meaningful.
  #
  # Detailed information on how the minhash algorithm works and how minhashes
  # can be used can be found in "Chapter 3: Finding Similar Items" of the book
  # "Mining of Massive Datasets", by Leskovec, Rajaraman, and Ullman, available
  # for free at http://www.mmds.org/.
  #
  class Minhash
    DEFAULT_OPTIONS = {
      hash_size: 128,
      shingle_size: 5,
      seed: 3_141_592
    }.freeze

    # Builds a hasher for `data` with `options` and returns its digest.
    #
    # Raises ArgumentError when `data` is empty or is neither a string nor an
    # array-like collection.
    def self.digest(data, options = {})
      new(data, options).digest
    end

    # `data` is either a string, which is split into hashed shingles, or a
    # collection of integers. `options` overrides entries of DEFAULT_OPTIONS.
    def initialize(data = [], options = {})
      @options = DEFAULT_OPTIONS.merge(options)
      @data = parse(data)
    end

    # Appends a single element to the set that will be hashed and returns the
    # internal array of elements.
    def update(element)
      @data << element
    end

    # Returns the minhash of the current elements as an array of
    # `hash_size` integers. Raises ArgumentError when there are no elements.
    def digest
      raise ArgumentError, "input must not be empty" if @data.empty?

      self.class.__hash(@data, @options[:hash_size], @options[:seed])
    end

    private

    # Normalises the constructor input into a private array of elements.
    #
    # The type is checked before anything else is asked of `data`, so that
    # unsupported input reports the ArgumentError below rather than a
    # NoMethodError from an unrelated method call.
    def parse(data)
      if data.respond_to?(:to_a)
        # Copy so that #update never modifies (or trips over a frozen)
        # collection owned by the caller.
        data.to_a.dup
      elsif data.respond_to?(:to_str)
        tokenize(data.to_str)
      else
        raise ArgumentError, "input must be a string or array of integers"
      end
    end

    # Hashes the shingles of `string`; an empty string has no shingles.
    def tokenize(string)
      return [] if string.empty?

      self.class.__tokenize(string, @options[:shingle_size])
    end
  end
end
|
data/lib/minwise.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "minwise/version"
|
4
|
+
require_relative "minwise/minwise"
|
5
|
+
require_relative "minwise/minhash"
|
6
|
+
|
7
|
+
# A Ruby library for generating minwise hashes.
|
8
|
+
module Minwise
  class Error < StandardError; end

  class << self
    # Returns the Jaccard similarity of 2 arrays, a number between 0.0 and 1.0.
    #
    # The arrays are treated as sets, i.e. duplicate elements in an array are
    # only counted once. Two empty arrays are considered identical and give
    # 1.0 rather than the NaN that dividing zero by zero would produce.
    def similarity(set_one, set_two)
      # `|` and `&` are used instead of #union / #intersection because
      # Array#intersection does not exist before Ruby 2.7.
      union_size = (set_one | set_two).length
      return 1.0 if union_size.zero?

      (set_one & set_two).length / union_size.to_f
    end
  end
end
|
data/sig/minwise.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: minwise
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Samuel Scully
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-08-17 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email:
|
15
|
+
- sbscully@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions:
|
18
|
+
- ext/minwise/extconf.rb
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- ".rubocop.yml"
|
22
|
+
- CHANGELOG.md
|
23
|
+
- LICENSE.txt
|
24
|
+
- README.md
|
25
|
+
- Rakefile
|
26
|
+
- ext/minwise/extconf.rb
|
27
|
+
- ext/minwise/minwise.c
|
28
|
+
- ext/minwise/minwise.h
|
29
|
+
- lib/minwise.rb
|
30
|
+
- lib/minwise/minhash.rb
|
31
|
+
- lib/minwise/version.rb
|
32
|
+
- sig/minwise.rbs
|
33
|
+
homepage: https://github.com/sbscully/minwise
|
34
|
+
licenses:
|
35
|
+
- MIT
|
36
|
+
metadata:
|
37
|
+
rubygems_mfa_required: 'true'
|
38
|
+
homepage_uri: https://github.com/sbscully/minwise
|
39
|
+
source_code_uri: https://github.com/sbscully/minwise
|
40
|
+
changelog_uri: https://github.com/sbscully/minwise/blob/main/CHANGELOG.md
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 2.6.0
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubygems_version: 3.3.7
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Fast locality sensitive hashes using minwise hashing and derivatives.
|
60
|
+
test_files: []
|