wisconsin-benchmark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -0,0 +1,151 @@
1
+ module WisconsinBenchmark
2
+ # Array generator
3
+ class ArrayGenerator
4
+ def initialize(size)
5
+ if size <= 1000 then @generator = 279; @prime = 1009
6
+ elsif size <= 10000 then @generator = 2969; @prime = 10007
7
+ elsif size <= 100000 then @generator = 21395; @prime = 100003
8
+ elsif size <= 1000000 then @generator = 2107; @prime = 1000003
9
+ elsif size <= 10000000 then @generator = 211; @prime = 10000019
10
+ elsif size <= 100000000 then @generator = 21; @prime = 100000007
11
+ else
12
+ raise "too many rows requested #{size}"
13
+ end
14
+ @size = size
15
+ end
16
+
17
+ # summary of the object.
18
+ #
19
+ # @return [String] class name, size, and the value range of each array.
20
+ #
21
+ def inspect
22
+ <<~STR
23
+ <#{self.class} (
24
+ size=#{@size},
25
+ unique1=#{inspect_array(@unique1)},
26
+ unique2=#{inspect_array(@unique2)},
27
+ stringu1=#{inspect_array(@stringu1)},
28
+ stringu2=#{inspect_array(@stringu2)},
29
+ string4=#{inspect_array(@string4)}
30
+ )>
31
+ STR
32
+ end
33
+
34
+ # Create an array of unique integers in pseudo-random order,
35
+ # with values in the range 0...size.
36
+ #
37
+ # @return [Numo::UInt32] array of attribute :unique1.
38
+ #
39
+ def unique1
40
+ @unique1 ||= begin
41
+ warn 'Generating unique1'
42
+
43
+ seed = @generator
44
+ ary = @size.times.map do
45
+ seed = rand(seed)
46
+ seed - 1
47
+ end
48
+ Numo::UInt32.new(@size).store(ary)
49
+ end
50
+ end
51
+
52
+ # Create a sequential record array as 0...size.
53
+ #
54
+ # @return [Numo::UInt32] array of attribute :unique2.
55
+ #
56
+ def unique2
57
+ @unique2 ||= begin
58
+ warn 'Generating unique2'
59
+
60
+ Numo::UInt32.new(@size).seq
61
+ end
62
+ end
63
+
64
+ # Create a randomly distributed string array, stringu1.
65
+ #
66
+ # @return [Arrow::StringArray]
67
+ # array of generated randomly distributed distinct strings.
68
+ # - Each string is 52 bytes.
69
+ # - string[0..6] is the corresponding number from the randomly distributed unique1, converted to a string.
70
+ # Each number is written in base 26 with the digits mapped to 'A'..'Z' and left-padded with 'A' to 7 characters.
71
+ # {0 => 'AAAAAAA', 1 => 'AAAAAAB', ... 25 => 'AAAAAAZ', 26 => 'AAAAABA', ...}
72
+ # - string[7..] is 'x' * 45
73
+ # @example
74
+ # ['AAAAAFRxxxxx ... ', 'AAAABJVxxxxx ... ', 'AAAABBMxxxxx ... ', ... ]
75
+ #
76
+ def stringu1
77
+ @stringu1 ||= begin
78
+ warn 'Generating stringu1'
79
+
80
+ trailer = 'x' * 45
81
+ a = unique1.to_a.map do |i|
82
+ str = i.to_s(26).tr('0-9a-p', 'A-Z')
83
+ ('A' * (7 - str.size)) << str << trailer
84
+ end
85
+ Arrow::StringArray.new(a)
86
+ end
87
+ end
88
+
89
+ # Create a sequential string array, stringu2.
90
+ #
91
+ # @return [Arrow::StringArray]
92
+ # sequential array of strings.
93
+ # - Each string is 52 bytes.
94
+ # - string[0..6] is a sequential string consisting of 'A'..'Z', starting from 'AAAAAAA'.
95
+ # - string[7..] is 'x' * 45
96
+ # @example
97
+ # ['AAAAAAAxxxxx ... ', 'AAAAAABxxxxx ... ', 'AAAAAACxxxxx ... ', ... ]
98
+ #
99
+ def stringu2
100
+ @stringu2 ||= begin
101
+ warn 'Generating stringu2'
102
+
103
+ trailer = 'x' * 45
104
+ a = (('A' * 7)..).take(@size).map { _1 << trailer }
105
+ Arrow::StringArray.new(a)
106
+ end
107
+ end
108
+
109
+ # Create a cyclic repeated string array, string4.
110
+ #
111
+ # @return [Arrow::StringArray]
112
+ # cyclic repeating array of strings.
113
+ # - Each string is 52 bytes.
114
+ # - string[0..3] is one of ['AAAA', 'HHHH', 'OOOO', 'VVVV'].
115
+ # - string[4..] is 'x' * 48
116
+ # - four strings are repeated.
117
+ # @example
118
+ # ['AAAAxxxxx ... ', 'HHHHxxxxx ... ', 'OOOOxxxxx ... ', 'VVVVxxxxx ...', ... ]
119
+ #
120
+ def string4
121
+ @string4 ||= begin
122
+ warn 'Generating string4'
123
+
124
+ trailer = 'x' * 48
125
+ array = %w[AAAA HHHH OOOO VVVV].map { _1 << trailer }
126
+ a = @size.times.map { |i| array[i % 4] }
127
+ Arrow::StringArray.new(a)
128
+ end
129
+ end
130
+
131
+ private
132
+
133
+ # Get pseudo-random integers by a linear congruential generator.
134
+ def rand(seed)
135
+ loop do
136
+ seed = (@generator * seed) % @prime
137
+ break if seed <= @size
138
+ end
139
+ seed
140
+ end
141
+
142
+ def inspect_array(array)
143
+ if array.nil?
144
+ 'nil'
145
+ else
146
+ s, e = array.minmax
147
+ "#{s}..#{e}"
148
+ end
149
+ end
150
+ end
151
+ end
@@ -0,0 +1,49 @@
1
+ module WisconsinBenchmark
2
+ # Arrow::Table generator
3
+ class TableGenerator
4
+ # Create Scaled Wisconsin Benchmark dataset object.
5
+ #
6
+ # @param size [Integer] number of tuples.
7
+ #
8
+ def initialize(size)
9
+ @size = size
10
+ @array = WisconsinBenchmark::ArrayGenerator.new(size)
11
+ end
12
+
13
+ attr_reader :table, :size, :array
14
+
15
+ # Generate Scaled Wisconsin Benchmark dataset in Arrow::Table.
16
+ #
17
+ # @return [Arrow::Table] generated dataset in Arrow::Table.
18
+ #
19
+ def generate
20
+ unique1 = @array.unique1
21
+ onePercent = unique1 % 100
22
+
23
+ @table = Arrow::Table.new(
24
+ [
25
+ [:unique1, unique1],
26
+ [:unique2, @array.unique2],
27
+ [:two, unique1 % 2],
28
+ [:four, unique1 % 4],
29
+ [:ten, unique1 % 10],
30
+ [:twenty, unique1 % 20],
31
+ [:onePercent, onePercent],
32
+ [:tenPercent, unique1 % 10],
33
+ [:twentyPercent, unique1 % 5],
34
+ [:fiftyPercent, unique1 % 2],
35
+ [:unique3, unique1],
36
+ [:evenOnePercent, onePercent * 2],
37
+ [:oddOnePercent, (onePercent * 2) + 1],
38
+ [:stringu1, @array.stringu1],
39
+ [:stringu2, @array.stringu2],
40
+ [:string4, @array.string4],
41
+ ]
42
+ )
43
+ end
44
+
45
+ def inspect
46
+ "<#{self.class} (size=#{@size}, table=#{@table ? '#<Arrow::Table>' : 'nil'})>"
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WisconsinBenchmark
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'arrow-numo-narray'
4
+ require 'parquet'
5
+
6
+ require_relative 'wisconsin-benchmark/array_generator'
7
+ require_relative 'wisconsin-benchmark/table_generator'
8
+ require_relative 'wisconsin-benchmark/version'
9
+
10
+ module WisconsinBenchmark
11
+ class Error < StandardError; end
12
+ end
@@ -0,0 +1,6 @@
1
+ module Wisconsin
2
+ module Benchmark
3
+ VERSION: String
4
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
5
+ end
6
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/wisconsin-benchmark/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'wisconsin-benchmark'
7
+ spec.version = WisconsinBenchmark::VERSION
8
+ spec.authors = ['Hirokazu SUZUKI (heronshoes)']
9
+ spec.email = ['heronshoes877@gmail.com']
10
+
11
+ spec.summary = 'Wisconsin Benchmark dataset generator.'
12
+ spec.description = 'Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.'
13
+ spec.homepage = 'https://github.com/heronshoes/wisconsin-benchmark'
14
+ spec.license = 'MIT'
15
+ spec.required_ruby_version = '>= 3.0.0'
16
+
17
+ spec.metadata['homepage_uri'] = spec.homepage
18
+ spec.metadata['source_code_uri'] = 'https://github.com/heronshoes/wisconsin-benchmark'
19
+ spec.metadata['changelog_uri'] = 'https://github.com/heronshoes/wisconsin-benchmark/blob/main/CHANGELOG.md'
20
+
21
+ # Specify which files should be added to the gem when it is released.
22
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
+ spec.files = Dir.chdir(__dir__) do
24
+ `git ls-files -z`.split("\x0").reject do |f|
25
+ (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
26
+ end
27
+ end
28
+ spec.bindir = 'exe'
29
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
30
+ spec.require_paths = ['lib']
31
+
32
+ spec.add_dependency 'red-arrow', '~> 10.0.0'
33
+ spec.add_dependency 'red-arrow-numo-narray'
34
+ spec.add_dependency 'red-parquet'
35
+
36
+ # Development dependencies have moved to the Gemfile (rubygems/bundler#7237)
37
+
38
+ spec.metadata['rubygems_mfa_required'] = 'true'
39
+ end
metadata ADDED
@@ -0,0 +1,111 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wisconsin-benchmark
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Hirokazu SUZUKI (heronshoes)
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: red-arrow
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 10.0.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 10.0.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: red-arrow-numo-narray
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: red-parquet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.
56
+ email:
57
+ - heronshoes877@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".rubocop.yml"
63
+ - CHANGELOG.md
64
+ - CODE_OF_CONDUCT.md
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - datasets/WB_1E2.arrow
70
+ - datasets/WB_1E2.csv
71
+ - datasets/WB_1E2.parquet
72
+ - datasets/WB_1E3.arrow
73
+ - datasets/WB_1E3.csv
74
+ - datasets/WB_1E3.parquet
75
+ - datasets/WB_1E4.arrow
76
+ - datasets/WB_1E4.csv
77
+ - datasets/WB_1E4.parquet
78
+ - lib/wisconsin-benchmark.rb
79
+ - lib/wisconsin-benchmark/array_generator.rb
80
+ - lib/wisconsin-benchmark/table_generator.rb
81
+ - lib/wisconsin-benchmark/version.rb
82
+ - sig/wisconsin/benchmark.rbs
83
+ - wisconsin-benchmark.gemspec
84
+ homepage: https://github.com/heronshoes/wisconsin-benchmark
85
+ licenses:
86
+ - MIT
87
+ metadata:
88
+ homepage_uri: https://github.com/heronshoes/wisconsin-benchmark
89
+ source_code_uri: https://github.com/heronshoes/wisconsin-benchmark
90
+ changelog_uri: https://github.com/heronshoes/wisconsin-benchmark/blob/main/CHANGELOG.md
91
+ rubygems_mfa_required: 'true'
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: 3.0.0
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubygems_version: 3.4.1
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: Wisconsin Benchmark dataset generator.
111
+ test_files: []