wisconsin-benchmark 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rubocop.yml +39 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +21 -0
- data/README.md +78 -0
- data/Rakefile +16 -0
- data/datasets/WB_1E2.arrow +0 -0
- data/datasets/WB_1E2.csv +101 -0
- data/datasets/WB_1E2.parquet +0 -0
- data/datasets/WB_1E3.arrow +0 -0
- data/datasets/WB_1E3.csv +1001 -0
- data/datasets/WB_1E3.parquet +0 -0
- data/datasets/WB_1E4.arrow +0 -0
- data/datasets/WB_1E4.csv +10001 -0
- data/datasets/WB_1E4.parquet +0 -0
- data/lib/wisconsin-benchmark/array_generator.rb +151 -0
- data/lib/wisconsin-benchmark/table_generator.rb +49 -0
- data/lib/wisconsin-benchmark/version.rb +5 -0
- data/lib/wisconsin-benchmark.rb +12 -0
- data/sig/wisconsin/benchmark.rbs +6 -0
- data/wisconsin-benchmark.gemspec +39 -0
- metadata +111 -0
Binary file
|
@@ -0,0 +1,151 @@
|
|
1
|
+
module WisconsinBenchmark
  # Generates the individual attribute arrays of the scaled Wisconsin
  # Benchmark relation.
  #
  # Every array is built lazily on first access and memoized, and each
  # generator announces itself on stderr (via +warn+) the first time it runs.
  class ArrayGenerator
    # Rows of [largest supported size, generator (multiplier), prime (modulus)]
    # for the pseudo-random sequence behind #unique1, in ascending size order.
    # The first row whose limit is >= the requested size is used.
    # NOTE(review): the pairs are presumably the ones published with the
    # Wisconsin Benchmark so that the sequence never repeats within one
    # relation - confirm against the benchmark paper.
    LCG_PARAMETERS = [
      [1_000, 279, 1_009],
      [10_000, 2_969, 10_007],
      [100_000, 21_395, 100_003],
      [1_000_000, 2_107, 1_000_003],
      [10_000_000, 211, 10_000_019],
      [100_000_000, 21, 100_000_007]
    ].freeze
    private_constant :LCG_PARAMETERS

    # @param size [Integer] number of tuples to generate (0..100_000_000).
    # @raise [ArgumentError] if size is not a non-negative Integer.
    # @raise [RuntimeError] if size exceeds 100_000_000.
    def initialize(size)
      # Fail early and clearly: the lazy generators below rely on Integer#times
      # and would otherwise break later with an unrelated error.
      unless size.is_a?(Integer) && !size.negative?
        raise ArgumentError, "size must be a non-negative Integer: #{size.inspect}"
      end

      _limit, @generator, @prime = LCG_PARAMETERS.find { |limit, _, _| size <= limit }
      raise "too many rows requested #{size}" unless @generator

      @size = size
    end

    # Summary of the object.
    #
    # @return [String] class name, size and the min..max range of every array
    #   generated so far ('nil' for arrays that have not been generated yet).
    #
    def inspect
      <<~STR
        <#{self.class} (
        size=#{@size},
        unique1=#{inspect_array(@unique1)},
        unique2=#{inspect_array(@unique2)},
        stringu1=#{inspect_array(@stringu1)},
        stringu2=#{inspect_array(@stringu2)},
        string4=#{inspect_array(@string4)}
        )>
      STR
    end

    # Create the pseudo-randomly ordered record array with values in 0...size.
    #
    # @return [Numo::UInt32] array of attribute :unique1.
    #
    def unique1
      @unique1 ||= begin
        warn 'Generating unique1'

        # The sequence is seeded with the generator itself; #rand yields values
        # in 1..size, hence the shift to make them zero-based.
        seed = @generator
        values = Array.new(@size) do
          seed = rand(seed)
          seed - 1
        end
        Numo::UInt32.new(@size).store(values)
      end
    end

    # Create the sequential record array 0...size.
    #
    # @return [Numo::UInt32] array of attribute :unique2.
    #
    def unique2
      @unique2 ||= begin
        warn 'Generating unique2'

        Numo::UInt32.new(@size).seq
      end
    end

    # Create the randomly ordered string array, stringu1.
    #
    # @return [Arrow::StringArray]
    #   strings derived from unique1, in the same order.
    #   - Each string is 52 bytes.
    #   - string[0..6] is the unique1 value written in base 26 with the
    #     digits mapped to 'A'..'Z' and left-padded with 'A':
    #     {0 => 'AAAAAAA', 1 => 'AAAAAAB', ... 25 => 'AAAAAAZ', 26 => 'AAAAABA', ...}
    #   - string[7..] is 'x' * 45
    # @example
    #   ['AAAAAFRxxxxx ... ', 'AAAABJVxxxxx ... ', 'AAAABBMxxxxx ... ', ... ]
    #
    def stringu1
      @stringu1 ||= begin
        warn 'Generating stringu1'

        trailer = 'x' * 45
        strings = unique1.to_a.map do |value|
          # Integer#to_s(26) uses the digits 0-9a-p; remap them onto A-Z.
          letters = value.to_s(26).tr('0-9a-p', 'A-Z')
          letters.rjust(7, 'A') + trailer
        end
        Arrow::StringArray.new(strings)
      end
    end

    # Create the sequential string array, stringu2.
    #
    # @return [Arrow::StringArray]
    #   sequential array of strings.
    #   - Each string is 52 bytes.
    #   - string[0..6] counts up through 'A'..'Z' starting from 'AAAAAAA'.
    #   - string[7..] is 'x' * 45
    # @example
    #   ['AAAAAAAxxxxx ... ', 'AAAAAABxxxxx ... ', 'AAAAAACxxxxx ... ', ... ]
    #
    def stringu2
      @stringu2 ||= begin
        warn 'Generating stringu2'

        trailer = 'x' * 45
        # An endless String range steps with String#succ: AAAAAAA, AAAAAAB, ...
        strings = (('A' * 7)..).take(@size).map { |head| head + trailer }
        Arrow::StringArray.new(strings)
      end
    end

    # Create the cyclically repeating string array, string4.
    #
    # @return [Arrow::StringArray]
    #   cyclic repeating array of strings.
    #   - Each string is 52 bytes.
    #   - string[0..3] is one of ['AAAA', 'HHHH', 'OOOO', 'VVVV'].
    #   - string[4..] is 'x' * 48
    #   - the four strings are repeated in that order.
    # @example
    #   ['AAAAxxxxx ... ', 'HHHHxxxxx ... ', 'OOOOxxxxx ... ', 'VVVVxxxxx ...', ... ]
    #
    def string4
      @string4 ||= begin
        warn 'Generating string4'

        trailer = 'x' * 48
        cycle = %w[AAAA HHHH OOOO VVVV].map { |head| head + trailer }
        strings = Array.new(@size) { |i| cycle[i % 4] }
        Arrow::StringArray.new(strings)
      end
    end

    private

    # Get the next pseudo-random integer from a linear congruential generator
    # (multiplier @generator, modulus @prime, no increment).
    # Values above @size are skipped, so the result never exceeds @size.
    # Note that this shadows Kernel#rand inside this class.
    #
    # @param seed [Integer] previous value of the sequence.
    # @return [Integer] next value of the sequence that is <= @size.
    def rand(seed)
      loop do
        seed = (@generator * seed) % @prime
        return seed if seed <= @size
      end
    end

    # Render the value range of a generated array for #inspect.
    #
    # @param array [#minmax, nil] a generated array, or nil if not generated.
    # @return [String] 'nil' or "min..max".
    def inspect_array(array)
      return 'nil' if array.nil?

      min, max = array.minmax
      "#{min}..#{max}"
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module WisconsinBenchmark
  # Assembles the scaled Wisconsin Benchmark relation as an Arrow::Table from
  # the arrays supplied by an ArrayGenerator.
  class TableGenerator
    # table is nil until #generate has been called.
    attr_reader :table, :size, :array

    # Create a Scaled Wisconsin Benchmark dataset object.
    #
    # @param size [Integer] number of tuples.
    #
    def initialize(size)
      @size = size
      @array = ArrayGenerator.new(size)
    end

    # Generate the Scaled Wisconsin Benchmark dataset as an Arrow::Table.
    # The table is rebuilt and stored in @table on every call.
    #
    # @return [Arrow::Table] generated dataset.
    #
    def generate
      u1 = @array.unique1
      one_percent = u1 % 100

      # Insertion order of the hash is the column order of the table.
      columns = {
        unique1: u1,
        unique2: @array.unique2,
        two: u1 % 2,
        four: u1 % 4,
        ten: u1 % 10,
        twenty: u1 % 20,
        onePercent: one_percent,
        tenPercent: u1 % 10,
        twentyPercent: u1 % 5,
        fiftyPercent: u1 % 2,
        unique3: u1,
        evenOnePercent: one_percent * 2,
        oddOnePercent: (one_percent * 2) + 1,
        stringu1: @array.stringu1,
        stringu2: @array.stringu2,
        string4: @array.string4
      }

      # Arrow::Table.new receives the same [name, array] pairs as before.
      @table = Arrow::Table.new(columns.to_a)
    end

    # @return [String] class name, size and whether a table has been generated.
    def inspect
      table_label = @table ? '#<Arrow::Table>' : 'nil'
      "<#{self.class} (size=#{@size}, table=#{table_label})>"
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'arrow-numo-narray'
|
4
|
+
require 'parquet'
|
5
|
+
|
6
|
+
require_relative 'wisconsin-benchmark/array_generator'
|
7
|
+
require_relative 'wisconsin-benchmark/table_generator'
|
8
|
+
require_relative 'wisconsin-benchmark/version'
|
9
|
+
|
10
|
+
module WisconsinBenchmark
  # Library-specific error class, a direct subclass of StandardError.
  # NOTE(review): nothing in the visible sources raises it yet.
  class Error < StandardError
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/wisconsin-benchmark/version'
|
4
|
+
|
5
|
+
# Gem specification for wisconsin-benchmark.
Gem::Specification.new do |spec|
  spec.name = 'wisconsin-benchmark'
  spec.version = WisconsinBenchmark::VERSION
  spec.authors = ['Hirokazu SUZUKI (heronshoes)']
  spec.email = ['heronshoes877@gmail.com']

  spec.summary = 'Wisconsin Benchmark dataset generator.'
  spec.description = 'Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.'
  spec.homepage = 'https://github.com/heronshoes/wisconsin-benchmark'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.0.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/heronshoes/wisconsin-benchmark'
  spec.metadata['changelog_uri'] = 'https://github.com/heronshoes/wisconsin-benchmark/blob/main/CHANGELOG.md'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    # `git ls-files` prints paths relative to the current directory (__dir__),
    # whereas __FILE__ may be an absolute path depending on how the gemspec is
    # loaded; compare against the bare file name so the gemspec itself is
    # reliably left out of the package.
    gemspec_file = File.basename(__FILE__)
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec_file) || f.match?(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # Only red-arrow is pinned; the companion gems are left unconstrained.
  spec.add_dependency 'red-arrow', '~> 10.0.0'
  spec.add_dependency 'red-arrow-numo-narray'
  spec.add_dependency 'red-parquet'

  # Development dependency has gone to the Gemfile (rubygems/bundler#7237)

  spec.metadata['rubygems_mfa_required'] = 'true'
end
|
metadata
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wisconsin-benchmark
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Hirokazu SUZUKI (heronshoes)
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-12-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: red-arrow
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 10.0.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 10.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: red-arrow-numo-narray
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: red-parquet
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.
|
56
|
+
email:
|
57
|
+
- heronshoes877@gmail.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".rubocop.yml"
|
63
|
+
- CHANGELOG.md
|
64
|
+
- CODE_OF_CONDUCT.md
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE.txt
|
67
|
+
- README.md
|
68
|
+
- Rakefile
|
69
|
+
- datasets/WB_1E2.arrow
|
70
|
+
- datasets/WB_1E2.csv
|
71
|
+
- datasets/WB_1E2.parquet
|
72
|
+
- datasets/WB_1E3.arrow
|
73
|
+
- datasets/WB_1E3.csv
|
74
|
+
- datasets/WB_1E3.parquet
|
75
|
+
- datasets/WB_1E4.arrow
|
76
|
+
- datasets/WB_1E4.csv
|
77
|
+
- datasets/WB_1E4.parquet
|
78
|
+
- lib/wisconsin-benchmark.rb
|
79
|
+
- lib/wisconsin-benchmark/array_generator.rb
|
80
|
+
- lib/wisconsin-benchmark/table_generator.rb
|
81
|
+
- lib/wisconsin-benchmark/version.rb
|
82
|
+
- sig/wisconsin/benchmark.rbs
|
83
|
+
- wisconsin-benchmark.gemspec
|
84
|
+
homepage: https://github.com/heronshoes/wisconsin-benchmark
|
85
|
+
licenses:
|
86
|
+
- MIT
|
87
|
+
metadata:
|
88
|
+
homepage_uri: https://github.com/heronshoes/wisconsin-benchmark
|
89
|
+
source_code_uri: https://github.com/heronshoes/wisconsin-benchmark
|
90
|
+
changelog_uri: https://github.com/heronshoes/wisconsin-benchmark/blob/main/CHANGELOG.md
|
91
|
+
rubygems_mfa_required: 'true'
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 3.0.0
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubygems_version: 3.4.1
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: Wisconsin Benchmark dataset generator.
|
111
|
+
test_files: []
|