wisconsin-benchmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +39 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +21 -0
- data/README.md +78 -0
- data/Rakefile +16 -0
- data/datasets/WB_1E2.arrow +0 -0
- data/datasets/WB_1E2.csv +101 -0
- data/datasets/WB_1E2.parquet +0 -0
- data/datasets/WB_1E3.arrow +0 -0
- data/datasets/WB_1E3.csv +1001 -0
- data/datasets/WB_1E3.parquet +0 -0
- data/datasets/WB_1E4.arrow +0 -0
- data/datasets/WB_1E4.csv +10001 -0
- data/datasets/WB_1E4.parquet +0 -0
- data/lib/wisconsin-benchmark/array_generator.rb +151 -0
- data/lib/wisconsin-benchmark/table_generator.rb +49 -0
- data/lib/wisconsin-benchmark/version.rb +5 -0
- data/lib/wisconsin-benchmark.rb +12 -0
- data/sig/wisconsin/benchmark.rbs +6 -0
- data/wisconsin-benchmark.gemspec +39 -0
- metadata +111 -0
Binary file
|
@@ -0,0 +1,151 @@
|
|
1
|
+
module WisconsinBenchmark
  # Generates the individual column arrays of a scaled Wisconsin Benchmark
  # dataset.
  #
  # Each array is built on first access and memoized in an instance variable,
  # so repeated calls return the same object. Every generator method writes a
  # progress message to stderr (via `warn`) the first time it runs.
  class ArrayGenerator
    # Rows of [largest supported size, generator, prime] for the congruential
    # generator used by #unique1. The first row whose limit is >= size wins.
    # NOTE(review): the pairs presumably come from the Wisconsin Benchmark
    # paper's scalable generator table - confirm before editing.
    LCG_PARAMETERS = [
      [1_000, 279, 1_009],
      [10_000, 2_969, 10_007],
      [100_000, 21_395, 100_003],
      [1_000_000, 2_107, 1_000_003],
      [10_000_000, 211, 10_000_019],
      [100_000_000, 21, 100_000_007]
    ].freeze
    private_constant :LCG_PARAMETERS

    # @param size [Integer] number of rows to generate (0..100_000_000).
    # @raise [ArgumentError] if size is not a non-negative Integer.
    # @raise [RuntimeError] if size is larger than 100_000_000.
    def initialize(size)
      unless size.is_a?(Integer) && size >= 0
        raise ArgumentError, "size must be a non-negative Integer, got #{size.inspect}"
      end

      _limit, @generator, @prime = LCG_PARAMETERS.find { |limit, _generator, _prime| size <= limit }
      raise "too many rows requested #{size}" unless @generator

      @size = size
    end

    # Summary of the object.
    #
    # Only arrays that have already been generated are inspected; arrays not
    # generated yet are shown as 'nil', so calling this never triggers
    # generation.
    #
    # @return [String] class name, size and min..max range of each array.
    #
    def inspect
      <<~STR
        <#{self.class} (
          size=#{@size},
          unique1=#{inspect_array(@unique1)},
          unique2=#{inspect_array(@unique2)},
          stringu1=#{inspect_array(@stringu1)},
          stringu2=#{inspect_array(@stringu2)},
          string4=#{inspect_array(@string4)}
        )>
      STR
    end

    # Create a random/unique record array with values in 0...size.
    #
    # @return [Numo::UInt32] array of attribute :unique1.
    #
    def unique1
      @unique1 ||= begin
        warn 'Generating unique1'

        Numo::UInt32.new(@size).store(unique1_values)
      end
    end

    # Create a sequential record array as 0...size.
    #
    # @return [Numo::UInt32] array of attribute :unique2.
    #
    def unique2
      @unique2 ||= begin
        warn 'Generating unique2'

        Numo::UInt32.new(@size).seq
      end
    end

    # Create a randomly distributed string array, stringu1.
    #
    # @return [Arrow::StringArray]
    #   array of strings derived from the randomly distributed unique1.
    #   - Each string is 52 bytes.
    #   - string[0..6] is the unique1 value written in base 26 with the
    #     digits mapped to 'A'..'Z' and left-padded with 'A':
    #     {0 => 'AAAAAAA', 1 => 'AAAAAAB', ... 25 => 'AAAAAAZ', 26 => 'AAAAABA', ...}
    #   - string[7..] is 'x' * 45
    # @example
    #   ['AAAAAFRxxxxx ... ', 'AAAABJVxxxxx ... ', 'AAAABBMxxxxx ... ', ... ]
    #
    def stringu1
      @stringu1 ||= begin
        warn 'Generating stringu1'

        trailer = 'x' * 45
        strings = unique1.to_a.map { |number| to_letters(number) + trailer }
        Arrow::StringArray.new(strings)
      end
    end

    # Create a sequential string array, stringu2.
    #
    # @return [Arrow::StringArray]
    #   sequential array of strings.
    #   - Each string is 52 bytes.
    #   - string[0..6] is a sequential string of 'A'..'Z' starting from 'AAAAAAA'.
    #   - string[7..] is 'x' * 45
    # @example
    #   ['AAAAAAAxxxxx ... ', 'AAAAAABxxxxx ... ', 'AAAAAACxxxxx ... ', ... ]
    #
    def stringu2
      @stringu2 ||= begin
        warn 'Generating stringu2'

        trailer = 'x' * 45
        # An endless String range enumerates successors ('AAAAAAA', 'AAAAAAB', ...).
        strings = (('A' * 7)..).take(@size).map { |head| head + trailer }
        Arrow::StringArray.new(strings)
      end
    end

    # Create a cyclic repeated string array, string4.
    #
    # @return [Arrow::StringArray]
    #   cyclic repeating array of strings.
    #   - Each string is 52 bytes.
    #   - string[0..3] is one of ['AAAA', 'HHHH', 'OOOO', 'VVVV'].
    #   - string[4..] is 'x' * 48
    #   - the four strings are repeated in that order.
    # @example
    #   ['AAAAxxxxx ... ', 'HHHHxxxxx ... ', 'OOOOxxxxx ... ', 'VVVVxxxxx ...', ... ]
    #
    def string4
      @string4 ||= begin
        warn 'Generating string4'

        trailer = 'x' * 48
        cycle = %w[AAAA HHHH OOOO VVVV].map { |head| head + trailer }
        strings = Array.new(@size) { |index| cycle[index % 4] }
        Arrow::StringArray.new(strings)
      end
    end

    private

    # Plain Ruby values of :unique1: the generator sequence shifted down by
    # one so that the values fall in 0...size.
    #
    # @return [Array<Integer>]
    def unique1_values
      seed = @generator
      Array.new(@size) do
        seed = rand(seed)
        seed - 1
      end
    end

    # Convert a non-negative integer to a 7-character key: base-26 digits
    # ('0'-'9', 'a'-'p') are mapped onto 'A'-'Z' and left-padded with 'A'.
    #
    # @param number [Integer]
    # @return [String]
    def to_letters(number)
      number.to_s(26).tr('0-9a-p', 'A-Z').rjust(7, 'A')
    end

    # Get the next pseudo-random integer from a congruential generator.
    #
    # Repeats seed = (generator * seed) % prime until the value is <= size,
    # i.e. values above the requested size are skipped.
    # Note: this intentionally shadows Kernel#rand inside this class.
    #
    # @param seed [Integer] previous value of the sequence.
    # @return [Integer] next value of the sequence that is <= size.
    def rand(seed)
      loop do
        seed = (@generator * seed) % @prime
        break if seed <= @size
      end
      seed
    end

    # @param array [#minmax, nil] a generated array, or nil if not generated yet.
    # @return [String] 'nil', or "min..max" of the array.
    def inspect_array(array)
      return 'nil' if array.nil?

      min, max = array.minmax
      "#{min}..#{max}"
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module WisconsinBenchmark
  # Builds a Scaled Wisconsin Benchmark dataset as an Arrow::Table from the
  # arrays produced by WisconsinBenchmark::ArrayGenerator.
  class TableGenerator
    # @!attribute [r] table
    #   @return [Arrow::Table, nil] the last generated table; nil until
    #     #generate has been called.
    # @!attribute [r] size
    #   @return [Integer] number of tuples.
    # @!attribute [r] array
    #   @return [WisconsinBenchmark::ArrayGenerator] source of the columns.
    attr_reader :table, :size, :array

    # Create Scaled Wisconsin Benchmark dataset object.
    #
    # @param size [Integer] number of tuples.
    #
    def initialize(size)
      @size = size
      @array = WisconsinBenchmark::ArrayGenerator.new(size)
    end

    # Generate Scaled Wisconsin Benchmark dataset in Arrow::Table.
    #
    # All integer columns other than :unique2 are derived from :unique1.
    # The result is also stored and exposed through #table; each call builds
    # a new table.
    #
    # @return [Arrow::Table] generated dataset in Arrow::Table.
    #
    def generate
      unique1 = @array.unique1
      # Each remainder is computed once and shared by the columns that use it.
      mod2 = unique1 % 2
      mod10 = unique1 % 10
      one_percent = unique1 % 100
      even_one_percent = one_percent * 2

      @table = Arrow::Table.new(
        [
          [:unique1, unique1],
          [:unique2, @array.unique2],
          [:two, mod2],
          [:four, unique1 % 4],
          [:ten, mod10],
          [:twenty, unique1 % 20],
          [:onePercent, one_percent],
          [:tenPercent, mod10],
          [:twentyPercent, unique1 % 5],
          [:fiftyPercent, mod2],
          [:unique3, unique1],
          [:evenOnePercent, even_one_percent],
          [:oddOnePercent, even_one_percent + 1],
          [:stringu1, @array.stringu1],
          [:stringu2, @array.stringu2],
          [:string4, @array.string4]
        ]
      )
    end

    # @return [String] class name, size and whether a table has been generated.
    def inspect
      "<#{self.class} (size=#{@size}, table=#{@table ? '#<Arrow::Table>' : 'nil'})>"
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'arrow-numo-narray'
|
4
|
+
require 'parquet'
|
5
|
+
|
6
|
+
require_relative 'wisconsin-benchmark/array_generator'
|
7
|
+
require_relative 'wisconsin-benchmark/table_generator'
|
8
|
+
require_relative 'wisconsin-benchmark/version'
|
9
|
+
|
10
|
+
# Top-level namespace of the wisconsin-benchmark gem.
module WisconsinBenchmark
  # Error class declared under the gem's namespace; it adds nothing to
  # StandardError.
  class Error < StandardError
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/wisconsin-benchmark/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name = 'wisconsin-benchmark'
  spec.version = WisconsinBenchmark::VERSION
  spec.authors = ['Hirokazu SUZUKI (heronshoes)']
  spec.email = ['heronshoes877@gmail.com']

  # Description
  spec.summary = 'Wisconsin Benchmark dataset generator.'
  spec.description = 'Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.'
  spec.homepage = 'https://github.com/heronshoes/wisconsin-benchmark'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.0.0'

  # Links (the source repository is the same URL as the homepage).
  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Files shipped with the gem: everything tracked by git (`git ls-files -z`
  # prints NUL-separated paths), except this gemspec itself and paths that
  # start with a test, executable-stub or CI-configuration prefix.
  excluded = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}
  spec.files = Dir.chdir(__dir__) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path == __FILE__ || path.match?(excluded) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'red-arrow', '~> 10.0.0'
  spec.add_dependency 'red-arrow-numo-narray'
  spec.add_dependency 'red-parquet'

  # Development dependency has gone to the Gemfile (rubygems/bundler#7237)

  spec.metadata['rubygems_mfa_required'] = 'true'
end
|
metadata
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wisconsin-benchmark
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Hirokazu SUZUKI (heronshoes)
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-12-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: red-arrow
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 10.0.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 10.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: red-arrow-numo-narray
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: red-parquet
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Scalable Wisconsin Benchmark dataset generator for Arrow/Parquet.
|
56
|
+
email:
|
57
|
+
- heronshoes877@gmail.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".rubocop.yml"
|
63
|
+
- CHANGELOG.md
|
64
|
+
- CODE_OF_CONDUCT.md
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE.txt
|
67
|
+
- README.md
|
68
|
+
- Rakefile
|
69
|
+
- datasets/WB_1E2.arrow
|
70
|
+
- datasets/WB_1E2.csv
|
71
|
+
- datasets/WB_1E2.parquet
|
72
|
+
- datasets/WB_1E3.arrow
|
73
|
+
- datasets/WB_1E3.csv
|
74
|
+
- datasets/WB_1E3.parquet
|
75
|
+
- datasets/WB_1E4.arrow
|
76
|
+
- datasets/WB_1E4.csv
|
77
|
+
- datasets/WB_1E4.parquet
|
78
|
+
- lib/wisconsin-benchmark.rb
|
79
|
+
- lib/wisconsin-benchmark/array_generator.rb
|
80
|
+
- lib/wisconsin-benchmark/table_generator.rb
|
81
|
+
- lib/wisconsin-benchmark/version.rb
|
82
|
+
- sig/wisconsin/benchmark.rbs
|
83
|
+
- wisconsin-benchmark.gemspec
|
84
|
+
homepage: https://github.com/heronshoes/wisconsin-benchmark
|
85
|
+
licenses:
|
86
|
+
- MIT
|
87
|
+
metadata:
|
88
|
+
homepage_uri: https://github.com/heronshoes/wisconsin-benchmark
|
89
|
+
source_code_uri: https://github.com/heronshoes/wisconsin-benchmark
|
90
|
+
changelog_uri: https://github.com/heronshoes/wisconsin-benchmark/blob/main/CHANGELOG.md
|
91
|
+
rubygems_mfa_required: 'true'
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 3.0.0
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubygems_version: 3.4.1
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: Wisconsin Benchmark dataset generator.
|
111
|
+
test_files: []
|