wisconsin-benchmark 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Binary file
@@ -0,0 +1,151 @@
1
# frozen_string_literal: true

module WisconsinBenchmark
  # Generates the source columns of the Scaled Wisconsin Benchmark dataset.
  #
  # Each column is built on first access and memoized. Numeric columns are
  # Numo::UInt32 arrays, string columns are Arrow::StringArray objects.
  class ArrayGenerator
    # Pseudo-random generator parameters, one row per size bracket:
    # [largest supported size, generator, prime modulus].
    RANDOM_PARAMETERS = [
      [1_000, 279, 1_009],
      [10_000, 2_969, 10_007],
      [100_000, 21_395, 100_003],
      [1_000_000, 2_107, 1_000_003],
      [10_000_000, 211, 10_000_019],
      [100_000_000, 21, 100_000_007]
    ].freeze
    private_constant :RANDOM_PARAMETERS

    # Pick the generator/prime pair that matches the requested row count.
    #
    # @param size [Integer] number of rows to generate (0..100_000_000).
    # @raise [ArgumentError] if size is not a non-negative Integer.
    # @raise [RuntimeError] if size is larger than 100_000_000.
    #
    def initialize(size)
      unless size.is_a?(Integer) && !size.negative?
        raise ArgumentError, "size must be a non-negative Integer: #{size.inspect}"
      end

      parameters = RANDOM_PARAMETERS.find { |limit, _generator, _prime| size <= limit }
      raise "too many rows requested #{size}" unless parameters

      _limit, @generator, @prime = parameters
      @size = size
    end

    # summary of the object.
    #
    # @return [String] class name, size and the min..max range of every array
    #   generated so far ('nil' for arrays that are not generated yet).
    #
    def inspect
      <<~STR
        <#{self.class} (
          size=#{@size},
          unique1=#{inspect_array(@unique1)},
          unique2=#{inspect_array(@unique2)},
          stringu1=#{inspect_array(@stringu1)},
          stringu2=#{inspect_array(@stringu2)},
          string4=#{inspect_array(@string4)}
        )>
      STR
    end

    # Create the pseudo-random record array with values in 0...size.
    #
    # The values come from successive calls of the private #rand, starting
    # from the generator itself; each accepted value is shifted down by one.
    # NOTE(review): uniqueness relies on the generator/prime pairs of
    # RANDOM_PARAMETERS; it is not checked here.
    #
    # @return [Numo::UInt32] array of attribute :unique1.
    #
    def unique1
      @unique1 ||= begin
        warn 'Generating unique1'

        seed = @generator
        values = Array.new(@size) do
          seed = rand(seed)
          seed - 1
        end
        Numo::UInt32.new(@size).store(values)
      end
    end

    # Create a sequential record array as 0...size.
    #
    # @return [Numo::UInt32] array of attribute :unique2.
    #
    def unique2
      @unique2 ||= begin
        warn 'Generating unique2'

        Numo::UInt32.new(@size).seq
      end
    end

    # Create a randomly distributed string array, stringu1.
    #
    # @return [Arrow::StringArray]
    #   strings derived from the values of #unique1, in the same order.
    #   - Each string is 52 bytes.
    #   - string[0..6] is the unique1 value written in base 26 with the digits
    #     mapped to 'A'..'Z' and left-padded with 'A'.
    #     {0 => 'AAAAAAA', 1 => 'AAAAAAB', ... 25 => 'AAAAAAZ', 26 => 'AAAAABA', ...}
    #   - string[7..] is 'x' * 45
    # @example
    #   ['AAAAAFRxxxxx ... ', 'AAAABJVxxxxx ... ', 'AAAABBMxxxxx ... ', ... ]
    #
    def stringu1
      @stringu1 ||= begin
        warn 'Generating stringu1'

        trailer = 'x' * 45
        strings = unique1.to_a.map do |number|
          # Integer#to_s(26) uses the digits 0-9a-p; remap them onto A-Z.
          letters = number.to_s(26).tr('0-9a-p', 'A-Z')
          "#{letters.rjust(7, 'A')}#{trailer}"
        end
        Arrow::StringArray.new(strings)
      end
    end

    # Create a sequential string array, stringu2.
    #
    # @return [Arrow::StringArray]
    #   sequential array of strings.
    #   - Each string is 52 bytes.
    #   - string[0..6] is sequential string consists of 'A'..'Z' started from 'AAAAAAA'.
    #   - string[7..] is 'x' * 45
    # @example
    #   ['AAAAAAAxxxxx ... ', 'AAAAAABxxxxx ... ', 'AAAAAACxxxxx ... ', ... ]
    #
    def stringu2
      @stringu2 ||= begin
        warn 'Generating stringu2'

        trailer = 'x' * 45
        # The endless String range enumerates successors via String#succ.
        strings = (('A' * 7)..).take(@size).map { |head| head + trailer }
        Arrow::StringArray.new(strings)
      end
    end

    # Create a cyclic repeated string array, string4.
    #
    # @return [Arrow::StringArray]
    #   cyclic repeating array of strings.
    #   - Each string is 52 bytes.
    #   - string[0..3] is one of ['AAAA', 'HHHH', 'OOOO', 'VVVV'].
    #   - string[4..] is 'x' * 48
    #   - four strings are repeated.
    # @example
    #   ['AAAAxxxxx ... ', 'HHHHxxxxx ... ', 'OOOOxxxxx ... ', 'VVVVxxxxx ...', ... ]
    #
    def string4
      @string4 ||= begin
        warn 'Generating string4'

        trailer = 'x' * 48
        choices = %w[AAAA HHHH OOOO VVVV].map { |head| head + trailer }
        strings = Array.new(@size) { |index| choices[index % 4] }
        Arrow::StringArray.new(strings)
      end
    end

    private

    # Get pseudo-random integers by a linear congruential generator
    # (multiplier @generator, modulus @prime, no increment).
    #
    # Values larger than @size are skipped, so the result is <= @size.
    # Note: inside this class the method shadows Kernel#rand.
    #
    # @param seed [Integer] previous value of the sequence.
    # @return [Integer] next value of the sequence that is <= @size.
    #
    def rand(seed)
      loop do
        seed = (@generator * seed) % @prime
        break if seed <= @size
      end
      seed
    end

    # Render the value range of a generated array for #inspect.
    #
    # @param array [#minmax, nil] generated array, or nil if not generated yet.
    # @return [String] 'nil' or "min..max".
    #
    def inspect_array(array)
      if array.nil?
        'nil'
      else
        s, e = array.minmax
        "#{s}..#{e}"
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
# frozen_string_literal: true

module WisconsinBenchmark
  # Arrow::Table generator
  class TableGenerator
    # @return [Arrow::Table, nil] table built by the last call of #generate,
    #   nil until #generate has been called.
    attr_reader :table

    # @return [Integer] number of tuples given to the constructor.
    attr_reader :size

    # @return [WisconsinBenchmark::ArrayGenerator] generator of the source columns.
    attr_reader :array

    # Create Scaled Wisconsin Benchmark dataset object.
    #
    # No column is generated here; call #generate to build the table.
    #
    # @param size [Integer] number of tuples.
    #
    def initialize(size)
      @size = size
      @array = WisconsinBenchmark::ArrayGenerator.new(size)
    end

    # Generate Scaled Wisconsin Benchmark dataset in Arrow::Table.
    #
    # All numeric attributes except :unique2 are derived from :unique1.
    # The table is also stored and available through #table.
    #
    # @return [Arrow::Table] generated dataset in Arrow::Table.
    #
    def generate
      unique1 = @array.unique1
      one_percent = unique1 % 100

      @table = Arrow::Table.new(
        [
          [:unique1, unique1],
          [:unique2, @array.unique2],
          [:two, unique1 % 2],
          [:four, unique1 % 4],
          [:ten, unique1 % 10],
          [:twenty, unique1 % 20],
          [:onePercent, one_percent],
          # The three columns below equal one_percent modulo 10, 5 and 2,
          # because 10, 5 and 2 all divide 100.
          [:tenPercent, unique1 % 10],
          [:twentyPercent, unique1 % 5],
          [:fiftyPercent, unique1 % 2],
          [:unique3, unique1],
          [:evenOnePercent, one_percent * 2],
          [:oddOnePercent, (one_percent * 2) + 1],
          [:stringu1, @array.stringu1],
          [:stringu2, @array.stringu2],
          [:string4, @array.string4]
        ]
      )
    end

    # summary of the object.
    #
    # @return [String] class name, size and whether the table was generated.
    #
    def inspect
      "<#{self.class} (size=#{@size}, table=#{@table ? '#<Arrow::Table>' : 'nil'})>"
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

# Namespace of the wisconsin-benchmark gem.
module WisconsinBenchmark
  # Version string of the gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'arrow-numo-narray'
4
+ require 'parquet'
5
+
6
+ require_relative 'wisconsin-benchmark/array_generator'
7
+ require_relative 'wisconsin-benchmark/table_generator'
8
+ require_relative 'wisconsin-benchmark/version'
9
+
10
# Namespace of the wisconsin-benchmark gem.
module WisconsinBenchmark
  # Error class of the gem; a direct subclass of StandardError, so a bare
  # `rescue` catches it.
  class Error < StandardError
  end
end
@@ -0,0 +1,6 @@
1
# The Ruby sources define a single top-level module `WisconsinBenchmark`
# (not nested `Wisconsin::Benchmark`), so the signature must use that name.
module WisconsinBenchmark
  # Version string of the gem.
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/wisconsin-benchmark/version'
4
+
5
# Gem specification of wisconsin-benchmark.
Gem::Specification.new do |spec|
  repository_url = 'https://github.com/heronshoes/wisconsin-benchmark'

  spec.name = 'wisconsin-benchmark'
  spec.version = WisconsinBenchmark::VERSION
  spec.authors = ['Hirokazu SUZUKI (heronshoes)']
  spec.email = ['heronshoes877@gmail.com']

  spec.summary = 'Wisconsin Benchmark dataset generator.'
  spec.description = 'Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.'
  spec.homepage = repository_url
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.0.0'

  spec.metadata.merge!(
    'homepage_uri' => spec.homepage,
    'source_code_uri' => repository_url,
    'changelog_uri' => "#{repository_url}/blob/main/CHANGELOG.md",
    'rubygems_mfa_required' => 'true'
  )

  # Package every file tracked by git, except this gemspec itself and the
  # paths matched below (bin/test/spec/features directories, dot-files of
  # git/travis/circleci, appveyor files).
  excluded = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}
  spec.files = Dir.chdir(__dir__) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path == __FILE__ || path.match?(excluded) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}).map { File.basename(_1) }
  spec.require_paths = ['lib']

  # Runtime dependencies.
  spec.add_dependency 'red-arrow', '~> 10.0.0'
  spec.add_dependency 'red-arrow-numo-narray'
  spec.add_dependency 'red-parquet'

  # Development dependency has gone to the Gemfile (rubygems/bundler#7237)
end
metadata ADDED
@@ -0,0 +1,111 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wisconsin-benchmark
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Hirokazu SUZUKI (heronshoes)
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: red-arrow
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 10.0.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 10.0.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: red-arrow-numo-narray
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: red-parquet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.
56
+ email:
57
+ - heronshoes877@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".rubocop.yml"
63
+ - CHANGELOG.md
64
+ - CODE_OF_CONDUCT.md
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - datasets/WB_1E2.arrow
70
+ - datasets/WB_1E2.csv
71
+ - datasets/WB_1E2.parquet
72
+ - datasets/WB_1E3.arrow
73
+ - datasets/WB_1E3.csv
74
+ - datasets/WB_1E3.parquet
75
+ - datasets/WB_1E4.arrow
76
+ - datasets/WB_1E4.csv
77
+ - datasets/WB_1E4.parquet
78
+ - lib/wisconsin-benchmark.rb
79
+ - lib/wisconsin-benchmark/array_generator.rb
80
+ - lib/wisconsin-benchmark/table_generator.rb
81
+ - lib/wisconsin-benchmark/version.rb
82
+ - sig/wisconsin/benchmark.rbs
83
+ - wisconsin-benchmark.gemspec
84
+ homepage: https://github.com/heronshoes/wisconsin-benchmark
85
+ licenses:
86
+ - MIT
87
+ metadata:
88
+ homepage_uri: https://github.com/heronshoes/wisconsin-benchmark
89
+ source_code_uri: https://github.com/heronshoes/wisconsin-benchmark
90
+ changelog_uri: https://github.com/heronshoes/wisconsin-benchmark/blob/main/CHANGELOG.md
91
+ rubygems_mfa_required: 'true'
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: 3.0.0
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubygems_version: 3.4.1
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: Wisconsin Benchmark dataset generator.
111
+ test_files: []