mahout 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
1
# frozen_string_literal: true

require "digest"
require "ipaddr"
require "json"
require "yaml"

module Mahout
  # Typed accessor layer over mahout's YAML configuration.
  #
  # Built from a YAML file (path:) or a pre-parsed hash (data:), optionally
  # deep-merged with an overlay (merge:) and patched with CLI overrides
  # (host/user/key). Construction validates the result and raises ConfigError
  # on any structural problem, so a constructed Config is always usable.
  class Config
    REQUIRED_KEYS = %w[server instance postgres storage hardening].freeze
    VALID_PROFILES = %w[conservative balanced aggressive].freeze
    # Values interpolated into SQL/shell must stay alphanumeric + underscore.
    SAFE_IDENTIFIER = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/
    # CLI overrides that map 1:1 onto keys of the "server" section.
    SERVER_OVERRIDE_KEYS = %w[host user key].freeze

    attr_reader :data

    # @param path [String, nil] YAML file to load when no data hash is given
    # @param data [Hash, nil] pre-parsed configuration hash (takes precedence)
    # @param merge [Hash, nil] overlay deep-merged into the base config
    # @param overrides [Hash] CLI overrides (:host, :user, :key)
    # @raise [ConfigError] when neither path nor data is given or validation fails
    def initialize(path: nil, data: nil, merge: nil, overrides: {})
      @data =
        if data
          data
        elsif path
          YAML.safe_load_file(path, permitted_classes: [Symbol])
        else
          raise ConfigError, "must provide either path or data"
        end
      deep_merge!(@data, merge) if merge
      merge_overrides(overrides)
      validate
    end

    # Fingerprint of the effective config. MD5 is a fast content checksum
    # here (not used for security); changing the algorithm would invalidate
    # any previously recorded checksums.
    def checksum
      Digest::MD5.hexdigest(@data.to_json)
    end

    # Secrets live beside (not inside) @data so they never appear in the
    # checksum or any serialized form of the config.
    def set_secret(key, value)
      @secrets[key] = value
    end

    def secret(key)
      @secrets[key]
    end

    def host
      dig("server", "host")
    end

    def user
      dig("server", "user") || "root"
    end

    def key
      dig("server", "key") || "~/.ssh/id_rsa"
    end

    # SSH options with symbolized keys, as net-ssh style APIs expect.
    def ssh_options
      (dig("server", "ssh_options") || {}).transform_keys(&:to_sym)
    end

    def instance_name
      dig("instance", "name")
    end

    def environment
      dig("instance", "environment") || "production"
    end

    def pg_version
      dig("postgres", "version") || 17
    end

    def pg_port
      dig("postgres", "port") || 5432
    end

    def pg_database
      dig("postgres", "database")
    end

    def pg_username
      dig("postgres", "username")
    end

    def profile
      dig("profile") || "balanced"
    end

    def data_device
      dig("storage", "data_device")
    end

    def data_mount
      dig("storage", "data_mount")
    end

    def wal_device
      dig("storage", "wal_device")
    end

    def wal_mount
      dig("storage", "wal_mount")
    end

    def pgbouncer_port
      dig("pgbouncer", "port") || 6432
    end

    def pgbouncer_pool_mode
      dig("pgbouncer", "pool_mode") || "transaction"
    end

    # Backups are on when a pgbackrest section exists and is not explicitly
    # disabled with enabled: false.
    def pgbackrest_enabled?
      !dig("pgbackrest").nil? && dig("pgbackrest", "enabled") != false
    end

    def pgbackrest_stanza
      dig("pgbackrest", "stanza")
    end

    def pgbackrest_s3_bucket
      dig("pgbackrest", "s3_bucket")
    end

    # pgbackrest expects a bare hostname, so strip any http(s) scheme.
    def pgbackrest_s3_endpoint
      endpoint = dig("pgbackrest", "s3_endpoint") || "s3.amazonaws.com"
      endpoint.sub(%r{\Ahttps?://}, "")
    end

    def pgbackrest_s3_region
      dig("pgbackrest", "s3_region") || "us-east-1"
    end

    def pgbackrest_s3_uri_style
      dig("pgbackrest", "s3_uri_style") || "host"
    end

    def pgbackrest_retention_full
      dig("pgbackrest", "retention_full") || 4
    end

    def pgbackrest_process_max
      dig("pgbackrest", "process_max") || 4
    end

    # The schedule defaults below look like systemd OnCalendar expressions —
    # TODO confirm against the unit templates that consume them.
    def pgbackrest_schedule_full
      dig("pgbackrest", "schedule", "full") || "Sun *-*-* 02:00:00"
    end

    def pgbackrest_schedule_diff
      dig("pgbackrest", "schedule", "diff") || "Mon..Sat *-*-* 02:00:00"
    end

    def pgbackrest_schedule_incr
      dig("pgbackrest", "schedule", "incr") || "*-*-* *:30:00"
    end

    # Datadog is opt-in: only an explicit enabled: true turns it on.
    def datadog_enabled?
      dig("datadog", "enabled") == true
    end

    def datadog_site
      dig("datadog", "site") || "datadoghq.com"
    end

    def datadog_tags
      dig("datadog", "tags") || []
    end

    def allowed_cidrs
      dig("hardening", "allowed_cidrs") || []
    end

    # Hardening toggles default to on; only an explicit false disables them.
    def ssl_required?
      dig("hardening", "ssl_required") != false
    end

    def ssh_hardening?
      dig("hardening", "ssh_hardening") != false
    end

    # Password auth is the opposite: off unless explicitly enabled.
    def ssh_password_auth?
      dig("hardening", "ssh_password_auth") == true
    end

    def ssh_max_auth_tries
      dig("hardening", "ssh_max_auth_tries") || 3
    end

    def fail2ban?
      dig("hardening", "fail2ban") != false
    end

    def fail2ban_maxretry
      dig("hardening", "fail2ban_maxretry") || 5
    end

    def fail2ban_bantime
      dig("hardening", "fail2ban_bantime") || 3600
    end

    def unattended_upgrades?
      dig("hardening", "unattended_upgrades") != false
    end

    def ufw?
      dig("hardening", "ufw") != false
    end

    # Absolute path to a custom templates directory, or nil when unset.
    def templates_path
      path = dig("templates_path")
      return nil unless path

      File.expand_path(path)
    end

    def extensions
      dig("extensions") || ["pg_stat_statements"]
    end

    def postgres_overrides
      dig("postgres", "overrides") || {}
    end

    def tuner_profile
      dig("tuner", "profile")
    end

    def pg_data_dir
      "#{data_mount}/#{pg_version}/main"
    end

    # WAL directory when a separate wal mount is configured, else nil.
    def pg_wal_dir
      return nil unless wal_mount

      "#{wal_mount}/#{pg_version}/wal"
    end

    private

    # Recursively merges overlay into base, mutating base in place.
    # Hash values merge key-by-key; everything else is replaced wholesale.
    def deep_merge!(base, overlay)
      overlay.each do |key, value|
        if value.is_a?(Hash) && base[key].is_a?(Hash)
          deep_merge!(base[key], value)
        else
          base[key] = value
        end
      end
    end

    def dig(*keys)
      @data.dig(*keys)
    end

    # Primes @secrets (including any external secrets file) and applies
    # recognised CLI overrides onto the "server" section. Unknown override
    # keys are silently ignored, matching prior behavior.
    def merge_overrides(overrides)
      @secrets = {}
      load_secrets_file
      overrides.each do |key, value|
        key = key.to_s
        next unless SERVER_OVERRIDE_KEYS.include?(key)

        (@data["server"] ||= {})[key] = value
      end
    end

    # Best-effort load of an external secrets YAML; a missing or non-Hash
    # file is ignored rather than treated as an error.
    def load_secrets_file
      secrets_path = dig("secrets_file")
      return unless secrets_path

      path = File.expand_path(secrets_path)
      return unless File.exist?(path)

      secrets_data = YAML.safe_load_file(path, permitted_classes: [Symbol])
      return unless secrets_data.is_a?(Hash)

      secrets_data.each { |k, v| @secrets[k.to_s] = v.to_s }
    end

    # @raise [ConfigError] describing the first structural problem found
    def validate
      REQUIRED_KEYS.each do |key|
        raise ConfigError, "missing required config section: #{key}" unless @data[key]
      end
      raise ConfigError, "missing server.host" unless host
      raise ConfigError, "missing postgres.database" unless pg_database
      raise ConfigError, "missing postgres.username" unless pg_username
      raise ConfigError, "missing storage.data_device" unless data_device
      raise ConfigError, "missing storage.data_mount" unless data_mount
      if pgbackrest_enabled?
        raise ConfigError, "missing pgbackrest.stanza" unless pgbackrest_stanza
        raise ConfigError, "missing pgbackrest.s3_bucket" unless pgbackrest_s3_bucket
      end

      validate_safe_identifier("postgres.database", pg_database)
      validate_safe_identifier("postgres.username", pg_username)
      validate_cidrs
      validate_profile
    end

    # Rejects identifiers that could carry SQL/shell metacharacters.
    def validate_safe_identifier(label, value)
      return if value.nil?

      unless value.match?(SAFE_IDENTIFIER)
        raise ConfigError, "#{label} contains unsafe characters: #{value}. only alphanumeric and underscore allowed"
      end
    end

    # Each entry must parse as a valid IP/CIDR via IPAddr.
    def validate_cidrs
      allowed_cidrs.each do |cidr|
        IPAddr.new(cidr)
      rescue IPAddr::InvalidAddressError
        raise ConfigError, "invalid CIDR: #{cidr}"
      end
    end

    def validate_profile
      return if VALID_PROFILES.include?(profile)

      raise ConfigError, "unknown profile: #{profile}. must be one of: #{VALID_PROFILES.join(", ")}"
    end
  end
end
@@ -0,0 +1,172 @@
1
# frozen_string_literal: true

require "json"

module Mahout
  # Runs ad-hoc sequential (dd) and random-iops (fio) disk benchmarks against
  # the configured data and wal mounts, printing human-readable results.
  class DiskBenchmark
    BLOCK_SIZE = "1M"
    COUNT = 1024 # 1M blocks => roughly 1 GiB per sequential test
    TEST_FILE = "mahout-diskbench"

    # @param runner [#run, #dry_run?] remote command executor
    # @param config [#data_mount, #wal_mount] provides the mounts to exercise
    def initialize(runner:, config:)
      @runner = runner
      @config = config
    end

    # Entry point: sequential throughput on data (and wal, when a separate
    # wal mount exists), a wal->data cross-device copy, then random iops.
    def call
      data_mount = @config.data_mount
      wal_mount = @config.wal_mount

      ensure_fio

      $stdout.puts("disk benchmark")

      $stdout.puts("")
      $stdout.puts("sequential throughput (1M blocks):")
      run_throughput_tests("data (#{data_mount})", data_mount)

      if wal_mount
        $stdout.puts("")
        run_throughput_tests("wal (#{wal_mount})", wal_mount)
        $stdout.puts("")
        run_cross_device(wal_mount, data_mount)
      end

      $stdout.puts("")
      $stdout.puts("random iops (4k blocks):")
      run_iops_tests("data (#{data_mount})", data_mount)

      if wal_mount
        $stdout.puts("")
        run_iops_tests("wal (#{wal_mount})", wal_mount)
      end

      $stdout.puts("")
      $stdout.puts("done")
    end

    private

    # Installs fio when it is missing (best effort; failure is tolerated and
    # surfaces later as "could not run").
    def ensure_fio
      result = @runner.run("which fio", allow_failure: true)
      return if result.success? || @runner.dry_run?

      @runner.run("apt-get install -y fio", allow_failure: true)
    end

    # Sequential write then (cache-cleared) read on one mount.
    def run_throughput_tests(label, mount)
      $stdout.puts("#{label}:")
      run_write(mount)
      clear_cache
      run_read(mount)
      cleanup(mount)
    end

    # randread, randwrite and mixed randrw against one scratch file.
    def run_iops_tests(label, mount)
      $stdout.puts("#{label}:")
      file = "#{mount}/#{TEST_FILE}-fio"

      %w[randread randwrite randrw].each { |mode| run_fio(mode, file) }

      @runner.run("rm -f #{file}", allow_failure: true)
    end

    # One fio pass in the given mode; stderr is discarded so only the JSON
    # report reaches stdout.
    def run_fio(mode, file)
      result = @runner.run(
        "fio --name=test --filename=#{file} --size=1G --bs=4k " \
        "--rw=#{mode} --direct=1 --ioengine=libaio --iodepth=64 " \
        "--numjobs=4 --group_reporting --runtime=10 --time_based " \
        "--output-format=json 2>/dev/null",
        allow_failure: true
      )
      print_fio_result(" #{mode}", result)
    end

    # Parses fio's JSON report and prints iops plus mean latency (µs).
    def print_fio_result(label, result)
      return if @runner.dry_run?
      return $stdout.puts("#{label}: could not run") unless result.success?

      begin
        data = JSON.parse(result.stdout)
      rescue JSON::ParserError, TypeError
        # fio produced no usable JSON (empty/truncated output); skip quietly.
        return
      end
      job = data.dig("jobs", 0)
      return unless job

      read_iops = job.dig("read", "iops")&.round
      write_iops = job.dig("write", "iops")&.round
      read_lat = job.dig("read", "lat_ns", "mean")
      write_lat = job.dig("write", "lat_ns", "mean")

      parts = []
      if read_iops && read_iops > 0
        lat_us = read_lat ? " (#{(read_lat / 1000.0).round(1)}us)" : ""
        parts << "read #{read_iops} iops#{lat_us}"
      end
      if write_iops && write_iops > 0
        lat_us = write_lat ? " (#{(write_lat / 1000.0).round(1)}us)" : ""
        parts << "write #{write_iops} iops#{lat_us}"
      end

      $stdout.puts("#{label}: #{parts.join(", ")}")
    end

    # conv=fdatasync forces data to disk so the write number is honest.
    def run_write(mount)
      file = "#{mount}/#{TEST_FILE}"
      result = @runner.run(
        "dd if=/dev/zero of=#{file} bs=#{BLOCK_SIZE} count=#{COUNT} conv=fdatasync 2>&1",
        allow_failure: true
      )
      print_dd_result(" write", result)
    end

    def run_read(mount)
      file = "#{mount}/#{TEST_FILE}"
      result = @runner.run(
        "dd if=#{file} of=/dev/null bs=#{BLOCK_SIZE} count=#{COUNT} 2>&1",
        allow_failure: true
      )
      print_dd_result(" read", result)
    end

    # Measures copy throughput from the wal device onto the data device.
    def run_cross_device(from_mount, to_mount)
      from_file = "#{from_mount}/#{TEST_FILE}"
      to_file = "#{to_mount}/#{TEST_FILE}-cross"

      @runner.run(
        "dd if=/dev/zero of=#{from_file} bs=#{BLOCK_SIZE} count=#{COUNT} conv=fdatasync 2>&1",
        allow_failure: true
      )
      clear_cache

      $stdout.puts("wal -> data (#{from_mount} -> #{to_mount}):")
      result = @runner.run(
        "dd if=#{from_file} of=#{to_file} bs=#{BLOCK_SIZE} count=#{COUNT} conv=fdatasync 2>&1",
        allow_failure: true
      )
      print_dd_result(" copy", result)

      @runner.run("rm -f #{from_file} #{to_file}", allow_failure: true)
    end

    # Drops the page cache so reads hit the device, not RAM.
    def clear_cache
      @runner.run("sync && echo 3 > /proc/sys/vm/drop_caches", allow_failure: true)
    end

    def cleanup(mount)
      @runner.run("rm -f #{mount}/#{TEST_FILE}", allow_failure: true)
    end

    # Extracts "<bytes> copied, <speed>" from dd's output (stderr was
    # redirected onto stdout by the command).
    def print_dd_result(label, result)
      return if @runner.dry_run?

      output = result.stdout
      speed_line = output.lines.find { |l| l.include?("/s") }
      if speed_line
        speed = speed_line.strip.split(",").last.strip
        bytes = speed_line.strip.match(/(\d[\d.]*\s*[kMGT]?B)\s+copied/)
        size = bytes ? bytes[1] : "#{COUNT} MB"
        $stdout.puts("#{label}: #{size}, #{speed}")
      else
        $stdout.puts("#{label}: #{output.lines.last&.strip}")
      end
    end
  end
end
@@ -0,0 +1,144 @@
1
# frozen_string_literal: true

module Mahout
  # Catalog of supported PostgreSQL extensions plus helpers that derive apt
  # packages, shared_preload_libraries entries, and extra apt repositories
  # for a selection of extensions.
  class ExtensionRegistry
    # name:           name used for CREATE EXTENSION (may differ from the
    #                 registry key, e.g. key "pgvector" => extension "vector")
    # package:        apt package template with a %{version} placeholder, or
    #                 nil when the extension ships with postgres itself
    # preload:        whether it must be listed in shared_preload_libraries
    # depends_on:     registry keys that must be resolved before this one
    # apt_repo:       extra apt source template (%{codename}), or nil
    # create_options: extra SQL appended to CREATE EXTENSION, or nil
    Extension = Data.define(
      :name,
      :package,
      :preload,
      :depends_on,
      :apt_repo,
      :create_options
    )

    EXTENSIONS = {
      "pg_stat_statements" => Extension.new(
        name: "pg_stat_statements",
        package: nil,
        preload: true,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "pg_trgm" => Extension.new(
        name: "pg_trgm",
        package: nil,
        preload: false,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "btree_gist" => Extension.new(
        name: "btree_gist",
        package: nil,
        preload: false,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "pgcrypto" => Extension.new(
        name: "pgcrypto",
        package: nil,
        preload: false,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "postgis" => Extension.new(
        name: "postgis",
        package: "postgresql-%{version}-postgis-3",
        preload: false,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "timescaledb" => Extension.new(
        name: "timescaledb",
        package: "timescaledb-2-postgresql-%{version}",
        preload: true,
        depends_on: [],
        apt_repo: "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ %{codename} main",
        create_options: nil
      ),
      "pg_cron" => Extension.new(
        name: "pg_cron",
        package: "postgresql-%{version}-cron",
        preload: true,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "pg_partman" => Extension.new(
        name: "pg_partman",
        package: "postgresql-%{version}-partman",
        preload: true,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "hstore" => Extension.new(
        name: "hstore",
        package: nil,
        preload: false,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "pg_repack" => Extension.new(
        name: "pg_repack",
        package: "postgresql-%{version}-repack",
        preload: true,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "pgvector" => Extension.new(
        name: "vector",
        package: "postgresql-%{version}-pgvector",
        preload: false,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      ),
      "pg_prewarm" => Extension.new(
        name: "pg_prewarm",
        package: nil,
        preload: true,
        depends_on: [],
        apt_repo: nil,
        create_options: nil
      )
    }.freeze

    class << self
      # Maps registry keys to Extension records, inserting each extension's
      # direct dependencies ahead of it and de-duplicating by extension name
      # (order of first appearance wins). NOTE(review): dependencies are only
      # resolved one level deep — sufficient while every depends_on is empty.
      # pg_version: is accepted for signature symmetry with the other
      # helpers; resolution itself is version-independent.
      # @raise [UnknownExtension] for a key not present in the registry
      def resolve(names, pg_version:)
        names.each_with_object([]) do |name, resolved|
          ext = EXTENSIONS.fetch(name) do
            raise UnknownExtension, "unknown extension: #{name}. known: #{EXTENSIONS.keys.join(", ")}"
          end
          ext.depends_on.each { |dep| resolved << EXTENSIONS.fetch(dep) }
          resolved << ext
        end.uniq(&:name)
      end

      # Unique apt packages needed for the given extensions on this major
      # version; contrib extensions (package: nil) contribute nothing.
      def packages(extensions, pg_version:)
        extensions
          .filter_map { |ext| format(ext.package, version: pg_version) if ext.package }
          .uniq
      end

      # Extension names that must appear in shared_preload_libraries.
      def preload_libraries(extensions)
        extensions.select(&:preload).map(&:name)
      end

      # Unique extra apt source lines required by the given extensions.
      def apt_repos(extensions, pg_version:, codename:)
        extensions
          .filter_map(&:apt_repo)
          .map { |repo| format(repo, version: pg_version, codename: codename) }
          .uniq
      end
    end
  end
end