minwise 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 24510ece26235c8ef8956326e2d05d9162b70b4bede324ab6e548d2746194b8a
4
+ data.tar.gz: b1606b461d02f4a420daf8e326ce75e76af5ce7cf0bbf43f7064247c0ea0b968
5
+ SHA512:
6
+ metadata.gz: 7867b8181456107e42783cb7d1e2a62827710e8bfd5e9f4cd94d9521fbee069841c19b8a636d5dba477c4211d4bd2134d4cad1bdfcc9322e714853d696250959
7
+ data.tar.gz: e43b8b14d8cc308ce0b752c8de667d163f29905398b0201cf255b5990e53b5b5d1d67f75b3596a752631bc48c192b0fde3413a7a73b5f3f4dbef56fbd70e343c
data/.rubocop.yml ADDED
@@ -0,0 +1,18 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-minitest
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.6
7
+ NewCops: enable
8
+
9
+ Style/StringLiterals:
10
+ Enabled: true
11
+ EnforcedStyle: double_quotes
12
+
13
+ Style/StringLiteralsInInterpolation:
14
+ Enabled: true
15
+ EnforcedStyle: double_quotes
16
+
17
+ Layout/LineLength:
18
+ Max: 120
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2023-08-14
4
+
5
+ - Initial release
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Samuel Scully
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # Minwise
2
+
3
+ Fast locality-sensitive hashes using the minhash algorithm.
4
+
5
+ ## Installation
6
+
7
+ Install the gem and add to the application's Gemfile by executing:
8
+
9
+ $ bundle add minwise
10
+
11
+ If bundler is not being used to manage dependencies, install the gem by executing:
12
+
13
+ $ gem install minwise
14
+
15
+ ## Usage
16
+
17
+ Generate the minhash of a string:
18
+
19
+ ```ruby
20
+ Minwise::Minhash.digest("Chunky bacon")
21
+ # => [437974493, 147728091, 1185236492, ...]
22
+ ```
23
+
24
+ Generate a minhash with options:
25
+
26
+ ```ruby
27
+ Minwise::Minhash.digest("Chunky bacon", shingle_size: 9, hash_size: 500, seed: 42)
28
+ # => [203094719, 599941115, 1256960069, ...]
29
+ ```
30
+
31
+ You can also generate a minhash of a bare set of integers:
32
+
33
+ ```ruby
34
+ Minwise::Minhash.digest([1, 2, 3])
35
+ # => [1005141192, 713750329, 346603495, ...]
36
+ ```
37
+
38
+ ## Development
39
+
40
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
41
+
42
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
43
+
44
+ ## Contributing
45
+
46
+ Bug reports and pull requests are welcome on GitHub at https://github.com/sbscully/minwise.
47
+
48
+ ## License
49
+
50
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << "test"
8
+ t.libs << "lib"
9
+ t.test_files = FileList["test/**/test_*.rb"]
10
+ end
11
+
12
+ require "rubocop/rake_task"
13
+
14
+ RuboCop::RakeTask.new
15
+
16
+ require "rake/extensiontask"
17
+
18
+ desc "Compile all the extensions"
19
+ task build: :compile
20
+
21
+ Rake::ExtensionTask.new("minwise") do |ext|
22
+ ext.lib_dir = "lib/minwise"
23
+ end
24
+
25
+ desc "Benchmark Minwise::Minhash.digest"
26
+ task benchmark: :build do
27
+ ruby "./test/benchmark.rb"
28
+ end
29
+
30
+ task default: %i[clobber compile test rubocop]
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+
5
+ # Makes all symbols private by default to avoid unintended conflict
6
+ # with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
7
+ # selectively, or entirely remove this flag.
8
+ append_cflags("-fvisibility=hidden")
9
+
10
+ create_makefile("minwise/minwise")
@@ -0,0 +1,115 @@
1
+ #include <stdint.h>
2
+ #include "minwise.h"
3
+
4
+ static VALUE mMinwise;
5
+ static VALUE cMinwiseMinhash;
6
+
7
+ static uint64_t randr(uint64_t min, uint64_t max, uint64_t seed) {
8
+ // xorshift*
9
+ uint64_t x = seed + 1; // must not be zero
10
+ x ^= x >> 12;
11
+ x ^= x << 25;
12
+ x ^= x >> 27;
13
+ x *= 0x2545F4914F6CDD1DULL;
14
+
15
+ return (uint64_t)(min + (double)x / UINT64_MAX * (max - min));
16
+ }
17
+
18
+ static void minhash(const uint32_t* set, const size_t set_len, uint32_t* hash, const size_t hash_len, uint64_t seed) {
19
+ const uint64_t p = 4294967311; // first prime greater than UINT32_MAX
20
+ uint64_t a, b;
21
+ uint32_t x, h;
22
+
23
+ for (size_t i = 0; i < hash_len; i++) {
24
+ a = randr(0, p, seed + i);
25
+ a = (a / 2) * 2 - 1; // must be odd
26
+ b = randr(0, p, a);
27
+
28
+ h = UINT32_MAX;
29
+ for (size_t j = 0; j < set_len; j++) {
30
+ x = (uint32_t)(((a * set[j] + b) % p) % UINT32_MAX);
31
+
32
+ if (x < h) {
33
+ h = x;
34
+ }
35
+ }
36
+
37
+ hash[i] = h;
38
+ }
39
+ }
40
+
41
+ static VALUE c_minhash(VALUE self, VALUE rb_set, VALUE rb_hash_len, VALUE rb_hash_seed) {
42
+ Check_Type(rb_set, T_ARRAY);
43
+
44
+ size_t rb_set_len = RARRAY_LEN(rb_set);
45
+ size_t c_hash_len = NUM2SIZET(rb_hash_len);
46
+ uint64_t c_hash_seed = NUM2ULONG(rb_hash_seed);
47
+
48
+ uint32_t *c_set = (uint32_t *)malloc(rb_set_len * sizeof(uint32_t));
49
+ for (size_t i = 0; i < rb_set_len; i++) {
50
+ c_set[i] = NUM2UINT(rb_ary_entry(rb_set, i));
51
+ }
52
+
53
+ uint32_t *c_hash = (uint32_t *)malloc(c_hash_len * sizeof(uint32_t));
54
+ minhash(c_set, rb_set_len, c_hash, c_hash_len, c_hash_seed);
55
+
56
+ VALUE rb_hash = rb_ary_new_capa(c_hash_len);
57
+ for (size_t i = 0; i < c_hash_len; i++) {
58
+ rb_ary_store(rb_hash, i, UINT2NUM(c_hash[i]));
59
+ }
60
+
61
+ free(c_set);
62
+ free(c_hash);
63
+
64
+ return rb_hash;
65
+ }
66
+
67
+ static uint32_t fnv1a(char *str) {
68
+ uint32_t hash = 0x811c9dc5;
69
+ unsigned char *s = (unsigned char *)str;
70
+
71
+ while (*s) {
72
+ hash ^= (uint32_t)*s++;
73
+ hash *= 0x01000193;
74
+ }
75
+
76
+ return hash;
77
+ }
78
+
79
+ static VALUE c_tokenize(VALUE self, VALUE rb_string, VALUE rb_shingle_size) {
80
+ char *c_string = StringValueCStr(rb_string);
81
+ size_t c_string_len = RSTRING_LEN(rb_string);
82
+ size_t c_shingle_size = NUM2SIZET(rb_shingle_size);
83
+
84
+ if (c_string_len <= c_shingle_size) {
85
+ VALUE rb_tokens = rb_ary_new_capa(1);
86
+ rb_ary_store(rb_tokens, 0, UINT2NUM(fnv1a(c_string)));
87
+
88
+ return rb_tokens;
89
+ }
90
+
91
+ size_t rb_tokens_len = c_string_len - c_shingle_size + 1;
92
+ VALUE rb_tokens = rb_ary_new_capa(rb_tokens_len);
93
+
94
+ char buffer[c_shingle_size + 1];
95
+ for (size_t i = 0; i < rb_tokens_len; i++) {
96
+ for (size_t j = 0; j < c_shingle_size; j++) {
97
+ buffer[j] = c_string[i + j];
98
+ }
99
+ buffer[c_shingle_size + 1] = 0;
100
+
101
+ rb_ary_store(rb_tokens, i, UINT2NUM(fnv1a(buffer)));
102
+ }
103
+
104
+ return rb_tokens;
105
+ }
106
+
107
+ RUBY_FUNC_EXPORTED void
108
+ Init_minwise(void)
109
+ {
110
+ mMinwise = rb_define_module("Minwise");
111
+ cMinwiseMinhash = rb_define_class_under(mMinwise, "Minhash", rb_cObject);
112
+
113
+ rb_define_singleton_method(cMinwiseMinhash, "__hash", c_minhash, 3);
114
+ rb_define_singleton_method(cMinwiseMinhash, "__tokenize", c_tokenize, 2);
115
+ }
@@ -0,0 +1,6 @@
1
+ #ifndef MINWISE_H
2
+ #define MINWISE_H 1
3
+
4
+ #include "ruby.h"
5
+
6
+ #endif /* MINWISE_H */
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Minwise
4
+ # The classic minhash algorithm.
5
+ #
6
+ # The minhash digests of two arrays will share approximately the same
7
+ # proportion of elements as the underlying arrays. This is useful for finding
8
+ # similar items in large datasets.
9
+ #
10
+ # The interface is similar to classes in the `Digest` module of the standard
11
+ # library.
12
+ #
13
+ # Minwise::Minhash.digest([1, 2, 3])
14
+ # # => [1005141192, 713750329, 346603495, ...]
15
+ #
16
+ # String inputs are first converted to an array of fixed width chunks of
17
+ # characters called "shingles". For example `"Chunky"` with a shingle
18
+ # size of 3 would become `["Chu", "hun", "unk", "nky"]`, then that array is
19
+ # used to generate the minhash. The `shingle_size` option controls the chunk
20
+ # width used.
21
+ #
22
+ # Minwise::Minhash.digest("Chunky bacon", shingle_size: 3)
23
+ # # => [437974493, 147728091, 1185236492, ...]
24
+ #
25
+ # The size of the output `hash_size` and `seed` for generating the hash can
26
+ # also be set in options. Larger minhashes will give more accurate estimates
27
+ # of similarity between items, but are slower to generate and take more space.
28
+ #
29
+ # Minwise::Minhash.digest("Chunky bacon", hash_size: 900, seed: 84)
30
+ # # => [355390344, 825885127, 262059926, ...]
31
+ #
32
+ # When comparing two minhashes all the options used to generate the minhashes
33
+ # must be identical for the comparison to be meaningful.
34
+ #
35
+ # Detailed information on how the minhash algorithm works and how minhashes
36
+ # can be used can be found in "Chapter 3: Finding Similar Items" of the book
37
+ # "Mining of Massive Datasets", by Leskovec, Rajaraman, and Ullman, available
38
+ # for free at http://www.mmds.org/.
39
+ #
40
class Minhash
  # Options used when the caller does not override them.
  DEFAULT_OPTIONS = {
    hash_size: 128,
    shingle_size: 5,
    seed: 3_141_592
  }.freeze

  # Builds a minhash for +data+ in one call.
  #
  # @param data [String, #to_a] a string, or a collection of integers
  # @param options [Hash] overrides for DEFAULT_OPTIONS
  # @return [Array<Integer>] the minhash digest
  # @raise [ArgumentError] if +data+ is empty or of an unsupported type
  def self.digest(data, options = {})
    new(data, options).digest
  end

  # @param data [String, #to_a] a string, or a collection of integers
  # @param options [Hash] overrides for DEFAULT_OPTIONS
  # @raise [ArgumentError] if +data+ is neither string-like nor a collection
  def initialize(data = [], options = {})
    @options = DEFAULT_OPTIONS.merge(options)
    @data = parse(data)
  end

  # Appends one more element to the input set.
  #
  # @param element [Integer]
  # @return [Array] the accumulated input
  def update(element)
    @data << element
  end

  # @return [Array<Integer>] the minhash of everything added so far
  # @raise [ArgumentError] if no input has been provided
  def digest
    raise ArgumentError, "input must not be empty" if @data.empty?

    self.class.__hash(@data, @options[:hash_size], @options[:seed])
  end

  private

  # Normalises the input into a private Array owned by this instance.
  #
  # Collections are converted with +to_a+ and copied, so that #update never
  # mutates (or fails on a frozen) caller-owned array and non-Array
  # collections such as Set reach the native code as a real Array. nil is
  # rejected explicitly because it responds to +to_a+.
  def parse(data)
    return data.to_a.dup if !data.nil? && data.respond_to?(:to_a)

    unless data.respond_to?(:to_str)
      raise ArgumentError, "input must be a string or array of integers"
    end

    string = data.to_str
    return [] if string.empty?

    self.class.__tokenize(string, @options[:shingle_size])
  end
end
80
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Minwise
4
+ VERSION = "0.1.0"
5
+ end
data/lib/minwise.rb ADDED
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "minwise/version"
4
+ require_relative "minwise/minwise"
5
+ require_relative "minwise/minhash"
6
+
7
+ # A Ruby library for generating minwise hashes.
8
+ module Minwise
9
+ class Error < StandardError; end
10
+
11
+ class << self
12
+ # Returns the Jaccard similarity of 2 arrays, a number between 0.0 and 1.0.
13
+ #
14
+ # The arrays are treated as sets, i.e. duplicate elements in an array are
15
+ # only counted once.
16
# Returns the Jaccard similarity of two collections: the size of their
# intersection divided by the size of their union.
#
# Uses the `&` and `|` operators rather than `intersection`/`union` so it
# also works on Ruby 2.6, where `Array#intersection` does not exist.
#
# @param set_one [Array, Set]
# @param set_two [Array, Set]
# @return [Float] a value between 0.0 and 1.0; two empty collections are
#   treated as identical and yield 1.0 instead of NaN
def similarity(set_one, set_two)
  union_size = (set_one | set_two).length
  return 1.0 if union_size.zero?

  (set_one & set_two).length / union_size.to_f
end
19
+ end
20
+ end
data/sig/minwise.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Minwise
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: minwise
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Samuel Scully
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-08-17 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email:
15
+ - sbscully@gmail.com
16
+ executables: []
17
+ extensions:
18
+ - ext/minwise/extconf.rb
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rubocop.yml"
22
+ - CHANGELOG.md
23
+ - LICENSE.txt
24
+ - README.md
25
+ - Rakefile
26
+ - ext/minwise/extconf.rb
27
+ - ext/minwise/minwise.c
28
+ - ext/minwise/minwise.h
29
+ - lib/minwise.rb
30
+ - lib/minwise/minhash.rb
31
+ - lib/minwise/version.rb
32
+ - sig/minwise.rbs
33
+ homepage: https://github.com/sbscully/minwise
34
+ licenses:
35
+ - MIT
36
+ metadata:
37
+ rubygems_mfa_required: 'true'
38
+ homepage_uri: https://github.com/sbscully/minwise
39
+ source_code_uri: https://github.com/sbscully/minwise
40
+ changelog_uri: https://github.com/sbscully/minwise/blob/main/CHANGELOG.md
41
+ post_install_message:
42
+ rdoc_options: []
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 2.6.0
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ requirements: []
56
+ rubygems_version: 3.3.7
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Fast locality sensitive hashes using minwise hashing and derivatives.
60
+ test_files: []