rdbms_sampler 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d9834c6b4cba2c6c8271f9e8f744e43d1c8350b3465db252fb85b059ea1c93cd
4
+ data.tar.gz: feee1ba4541dc3e05eb3697272f0bdb049c5bdfbfa17344ad5b9d58675df5672
5
+ SHA512:
6
+ metadata.gz: 3646c28b02866c51364d89ca2e97b847fe6d1a72bc5904e5085b5eae7267effac3ac185dca8ccda389cdc645359e106324a5a68e5bf3691ac56c9138e2e0116b
7
+ data.tar.gz: fa0096829e7bae368a20b934284400aa39fc7c87e1840a39a381d7e970e6032a01422a721ba2d9d0b3cc8b17d130ae57c514191dae71a7ab443167f8ff71073e
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
6
+ *.sql
7
+ og
8
+ *~
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over TLS; a plain-HTTP source can be tampered with in transit.
source 'https://rubygems.org'

# Specify your gem's dependencies in rdbms_sampler.gemspec
gemspec
5
+
6
+
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ RDBMS Sampler
2
+ =============
3
+
4
+ Command line utility for extracting a sample (subset of all records) from a relational
5
+ database system (such as MySQL) while *maintaining the referential integrity* of the sample.
6
+
7
+ Description
8
+ -----------
9
+
10
+ Need e.g. 1000 rows from each of your production tables, but feel the pain of making
11
+ sure to include dependent rows, their dependents and so on, ad infinitum?
12
+
13
+ Look no further. This tiny utility will take care that referential dependencies are
14
+ fulfilled by recursively expanding the row sample with the rows its unfulfilled dependencies require, until
15
+ the sample is referentially consistent.
16
+
17
+ Installation
18
+ ------------
19
+
20
+ Install with `gem install rdbms_sampler`.
21
+
22
+ Alternatively, clone the repository and install dependencies with `bundle install`.
23
+ Then execute with `bundle exec rdbms_sampler ...`.
24
+
25
+ Commands
26
+ --------
27
+
28
+ help Display global or [command] help documentation.
29
+ sample Extract a sample from the given connection
30
+
31
+ Options
32
+ -------
33
+
34
+ --adapter NAME
35
+ ActiveRecord adapter to use
36
+
37
+ --databases NAMES
38
+ Comma-separated list of databases to sample
39
+
40
+ --username USER
41
+ Username for connection
42
+
43
+ --password PASSWORD
44
+ Password for connection
45
+
46
+ --encoding ENCODING
47
+ Encoding for connection
48
+
49
+ --host HOST
50
+ Host name or IP for connection
51
+
52
+ --socket PATH
53
+ Socket for connection
54
+
55
+ --rows NUM
56
+ Number of rows to sample per table
57
+
58
+ --log PATH
59
+ Log queries to PATH
60
+
61
+ Global Options
62
+ --------------
63
+
64
+ -h, --help
65
+ Display help documentation
66
+
67
+ -v, --version
68
+ Display version information
69
+
70
+ -t, --trace
71
+ Display backtrace when an error occurs
72
+
73
+ Usage
74
+ -----
75
+
76
+ rdbms_sampler --databases DB1,DB2 --username USER --password PASS --rows 100 > sample.sql
77
+
78
+
79
+
80
+ CAVEATS
81
+ -------
82
+
83
+ Only single-column foreign keys are currently handled.
84
+
85
+ Additionally, due to a bug in the current implementation, if a referenced column
86
+ is named anything but `id`, referenced rows might get included multiple times.
87
+
88
+ You will probably need to disable foreign key checks *during import*, since inserts in
89
+ the output are not ordered with respect to referential integrity.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler/gem_tasks'
2
+
data/bin/rdbms_sampler ADDED
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby
# Command line entry point: connects to the database described by the options
# and prints a referentially consistent sample as SQL on standard output.
# Progress messages go to standard error so the SQL can be redirected to a file.
require "rdbms_sampler"
require "commander/import"
require "logger"

program :version, RdbmsSampler::VERSION
program :description, 'Extract a sample of desired size from a database while ensuring referential integrity.'
default_command :sample

command :sample do |c|
  c.description = 'Extract a sample from the given connection'
  c.option '--adapter NAME', String, 'ActiveRecord adapter to use'
  c.option '--databases NAMES', String, 'Comma-separated names of databases to sample'
  c.option '--username USER', String, 'Username for connection'
  c.option '--password PASSWORD', String, 'Password for connection'
  c.option '--encoding ENCODING', String, 'Encoding for connection'
  c.option '--socket PATH', String, 'Socket for connection'
  c.option '--host HOST_NAME', String, 'Host name'
  c.option '--rows NUM', Integer, 'Number of rows to sample per table'
  c.option '--log PATH', String, 'Log queries to PATH'
  c.when_called do |_args, options|
    options.default \
      :adapter => 'mysql2',
      :username => 'root',
      :encoding => 'utf8',
      :cast => false,
      :rows => 1000

    # --databases has no default. Without this check a missing option would
    # surface as a NoMethodError on nil instead of a usable message.
    schemas = options.databases.to_s.split(',').map(&:strip).reject(&:empty?)
    abort 'No databases given. Use --databases DB1,DB2 to name the databases to sample.' if schemas.empty?

    ActiveRecord::Base.logger = Logger.new(options.log) if options.log
    warn 'Connecting...'
    ActiveRecord::Base.establish_connection(options.__hash__).with_connection do |conn|
      puts RdbmsSampler::Sample.new(conn: conn, rows_per_table: options.rows, schemas: schemas).to_sql
    end
  end
end
36
+
@@ -0,0 +1,5 @@
1
+ require "rdbms_sampler/version"
2
+ require "rdbms_sampler/sample"
3
+ require "active_record"
4
+
5
+
@@ -0,0 +1,35 @@
1
module RdbmsSampler
  # A single row-level reference that must be satisfied for the sample to be
  # consistent: the "parent" side is the referencing column, the "child" side
  # is the referenced table/column, and +value+ is the referenced value.
  #
  # Two dependencies are considered equal when they point at the same child
  # schema, table, key and value, regardless of which parent they come from.
  class Dependency

    attr_reader :parent_schema
    attr_reader :parent_table
    attr_reader :parent_key
    attr_reader :child_schema_name
    attr_reader :child_table_name
    attr_reader :child_key
    attr_reader :value

    def initialize(parent_schema, parent_table, parent_key, child_schema, child_table, child_key, value)
      @parent_schema = parent_schema
      @parent_table = parent_table
      @parent_key = parent_key
      @child_schema_name = child_schema
      @child_table_name = child_table
      @child_key = child_key
      @value = value
    end

    # @return [String] "schema.table" of the referenced (child) table
    def identifier
      "#{child_schema_name}.#{child_table_name}"
    end

    # @param [Object] other
    # @return [Boolean] true when +other+ is a Dependency on the same child row
    def eql?(other)
      other.is_a?(Dependency) && target.eql?(other.target)
    end
    alias == eql?

    # Must agree with #eql?: Hash and Set look objects up by #hash first, so
    # without this override equal dependencies would never be de-duplicated.
    # @return [Integer]
    def hash
      target.hash
    end

    def to_s
      "reference from #{parent_schema}.#{parent_table}.#{parent_key} to #{identifier}[#{child_key}=#{value}]"
    end

    protected

    # The attributes that define equality, shared by #eql? and #hash.
    def target
      [child_schema_name, child_table_name, child_key, value]
    end

  end
end
@@ -0,0 +1,26 @@
1
module RdbmsSampler

  # Read-only record of one foreign key column: the constraint name, the
  # schema/table/column that carries the key, and the schema/table/column
  # it references.
  class ForeignKey
    attr_reader :constraint_name, :schema, :table, :key,
                :referenced_schema, :referenced_table, :referenced_key

    # Arguments are positional and stored unchanged under the reader of the
    # same name.
    def initialize(constraint_name, schema, table, key,
                   referenced_schema, referenced_table, referenced_key)
      @constraint_name, @schema, @table, @key =
        constraint_name, schema, table, key
      @referenced_schema, @referenced_table, @referenced_key =
        referenced_schema, referenced_table, referenced_key
    end
  end

end
@@ -0,0 +1,76 @@
1
+ require 'rdbms_sampler/table_sample'
2
+ require 'active_support/core_ext/array'
3
+
4
module RdbmsSampler

  # A sample spanning every base table of the given schemas. Each table is
  # sampled on its own and the per-table samples are then expanded until no
  # table reports new dependencies.
  class Sample

    # @param [Hash] options
    #   :conn            connection used for quoting and querying
    #   :rows_per_table  rows to sample per table (defaults to 1000)
    #   :schemas         schema name or list of schema names to sample
    def initialize(options = {})
      @connection = options[:conn]
      @rows_per_table = options[:rows_per_table] || 1000
      @table_samples = {}
      # Normalise nil / a single name / a list into an Array so the rest of
      # the class never has to deal with a missing schema list.
      @schemas = Array(options[:schemas])
      @computed = false
    end

    # Discover the tables, sample each of them, then keep expanding the
    # samples until a full pass over all tables adds no new dependencies.
    # Progress is reported through +warn+.
    def compute!
      quoted_schema_names = @schemas.map { |name| @connection.quote_table_name(name) }
      warn "Discovering tables in databases: #{quoted_schema_names.to_sentence}..."
      tables_without_views.each do |schema_name, table_name|
        table_sample = TableSample.new(@connection, schema_name, table_name, @rows_per_table)
        @table_samples[table_sample.identifier] = table_sample
      end
      return warn 'No tables found!' if @table_samples.empty?
      warn "Sampling #{@table_samples.count} tables..."
      @table_samples.each_value(&:sample!)
      warn 'Ensuring referential integrity...'
      loop do
        new_dependencies = 0
        @table_samples.each_value do |table_sample|
          newly_added = table_sample.ensure_referential_integrity(self)
          next unless newly_added > 0
          new_dependencies += newly_added
          warn " Expanded sample with #{newly_added} new rows referenced from table #{table_sample.quoted_name}"
        end
        # A pass that adds nothing means every pending reference was resolved.
        break unless new_dependencies > 0
        warn " Discovered #{new_dependencies} new dependencies"
      end
      warn 'Referential integrity obtained'

      warn 'Final sample contains:'
      @table_samples.each_value do |table_sample|
        warn " #{table_sample.size} row(s) from `#{table_sample.identifier}`"
      end
      @computed = true
    end

    # @param [Dependency] dependency
    # @return [TableSample] the sample of the table the dependency points at
    # @raise [RuntimeError] when that table is not part of this sample
    def table_sample_for_dependency(dependency)
      @table_samples.fetch(dependency.identifier) do
        raise "Table sample for [#{dependency.identifier}] not found"
      end
    end

    # @return [String] the SQL of all table samples joined by newlines;
    #   computes the sample first if that has not completed yet
    def to_sql
      compute! unless @computed
      @table_samples.values.map(&:to_sql).join("\n")
    end

    private

    # Schema/table name pairs of all base tables (views excluded) in the
    # configured schemas.
    def tables_without_views
      # An empty list would render as `IN ()`, which is not valid SQL.
      return [] if @schemas.empty?

      quoted_schema_names = @schemas.map { |name| @connection.quote(name) }.join(', ')
      @connection.execute <<-SQL
        SELECT TABLE_SCHEMA, TABLE_NAME
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_TYPE = 'BASE TABLE'
        AND TABLE_SCHEMA IN (#{quoted_schema_names})
      SQL
    end
  end

end
@@ -0,0 +1,169 @@
1
+ require 'pry'
2
+ require 'rdbms_sampler/dependency'
3
+ require 'rdbms_sampler/foreign_key'
4
+
5
module RdbmsSampler

  # The sampled rows of one table, together with the references those rows
  # make to other rows (pending dependencies) that still have to be resolved.
  class TableSample

    attr_reader :pending_dependencies

    # @param connection  object used for quoting and querying
    # @param [String] schema_name
    # @param [String] table_name
    # @param [Integer] size  number of rows to fetch initially
    def initialize(connection, schema_name, table_name, size = 1000)
      @schema = schema_name
      @table = table_name
      @connection = connection
      @size = size
      @pending_dependencies = Set.new
      @sample = Set.new
      @sampled = false
      # column name => Set of values present in @sample for that column.
      # Built lazily, only for columns that are actually looked up.
      @value_index = {}
    end

    # Fetch the initial rows once; later calls return the current sample.
    # @return [Set] the sampled rows
    def sample!
      fetch(@size) unless @sampled
      @sample
    end

    # @return [Integer] rows held once sampled, otherwise the requested size
    def size
      @sampled ? @sample.size : @size
    end

    # @return [String] "schema.table"
    def identifier
      "#{@schema}.#{@table}"
    end

    # Add the row the given dependency points at to the sample.
    # @param [Dependency] dependency
    # @return [Integer] number of new dependencies introduced by the added
    #   row; 0 when the dependency was already satisfied
    # @raise [RuntimeError] when no row matches the dependency
    def fulfil(dependency)
      return 0 if fulfilled?(dependency)
      quoted_column = @connection.quote_column_name(dependency.child_key)
      quoted_value = @connection.quote(dependency.value)
      sql = "SELECT * FROM #{quoted_name} WHERE #{quoted_column} = #{quoted_value}"
      row = @connection.select_one(sql)
      raise "Could not fulfil #{dependency} using query [#{sql}]" if row.nil?
      add row
    end

    # Whether a sampled row already carries the referenced value in the
    # referenced column. Works for any column, not only `id`.
    # @param [Dependency] dependency
    def fulfilled?(dependency)
      values_in(dependency.child_key).include?(dependency.value)
    end

    # Add a row to the table sample.
    # Returns number of new dependencies introduced.
    def add(row)
      return 0 unless @sample.add?(row)
      # Keep the already-built column indexes in step with the sample.
      @value_index.each do |column, values|
        values << row[column] unless row[column].nil?
      end
      dependencies_for(row).count { |dep| @pending_dependencies.add?(dep) }
    end

    # Resolve every dependency currently pending for this table.
    # @param [Sample] sample  used to find the table each dependency targets
    # @return [Integer] total number of new dependencies introduced
    def ensure_referential_integrity(sample)
      # Swap in a fresh set before iterating: fulfilling a self-referencing
      # dependency adds to @pending_dependencies while we walk the old set.
      dependencies_in_progress = @pending_dependencies
      @pending_dependencies = Set.new
      dependencies_in_progress.map { |dependency|
        sample.table_sample_for_dependency(dependency).fulfil(dependency)
      }.sum
    end

    # @return [String] a comment header followed by INSERT statements for
    #   the sampled rows (no INSERT when the sample is empty)
    def to_sql
      ret = "\n-- Sample from #{quoted_name} (#{@sample.count} rows)\n"
      unless @sample.empty?
        quoted_cols = @sample.first.keys.map { |col| @connection.quote_column_name(col) }
        # INSERT in batches to reduce the likelihood of hitting `max_allowed_packet`
        @sample.each_slice(250) do |rows|
          values = rows.map { |row|
            row.values.map { |val| @connection.quote(val) }.join(',')
          }.join("),\n (")
          ret << "INSERT INTO #{quoted_name} \n (#{quoted_cols * ','}) \nVALUES \n (#{values});\n"
        end
      end
      ret
    end

    # @return [String] quoted "schema"."table" as produced by the connection
    def quoted_name
      @connection.quote_table_name(@schema) + '.' + @connection.quote_table_name(@table)
    end

    protected

    # Load up to +count+ rows, ordered descending by the first primary key
    # column when the table has one, and add them to the sample.
    def fetch(count = 1000)
      sql = "SELECT * FROM #{quoted_name}"
      pks = primary_keys
      unless pks.empty?
        order_by = @connection.quote_column_name(pks.first)
        sql += " ORDER BY #{order_by} DESC"
      end
      sql += " LIMIT #{count}"
      warn " Sampling #{count} rows from #{quoted_name}..."
      @connection.select_all(sql).each { |row| add(row) }
      @sampled = true
    end

    # Values of +column+ across the sampled rows, indexed on first use.
    # @param [String] column
    # @return [Set]
    def values_in(column)
      @value_index[column] ||= @sample.each_with_object(Set.new) do |row, values|
        values << row[column] unless row[column].nil?
      end
    end

    # @param [ForeignKey] fk
    # @param [Hash] row
    # @return [Dependency, nil] nil when the row's foreign key column is NULL
    def dependency_for(fk, row)
      value = row[fk.key]
      return if value.nil?
      Dependency.new(fk.schema, fk.table, fk.key, fk.referenced_schema, fk.referenced_table, fk.referenced_key, value)
    end

    # @param [Hash] row
    # @return [Array<Dependency>] one entry per non-NULL foreign key column
    def dependencies_for(row)
      foreign_keys.map { |fk| dependency_for(fk, row) }.compact
    end

    # Foreign keys of this table, discovered once and then cached.
    def foreign_keys
      @fks ||= discover_foreign_keys
    end

    def discover_foreign_keys
      quoted_schema = @connection.quote(@schema)
      quoted_table = @connection.quote(@table)

      sql = <<-SQL
        SELECT
        fk.constraint_name,
        fk.table_schema,
        fk.table_name,
        fk.column_name,
        fk.referenced_table_schema,
        fk.referenced_table_name,
        fk.referenced_column_name
        FROM information_schema.key_column_usage fk
        WHERE fk.referenced_column_name IS NOT NULL
        AND fk.table_schema = #{quoted_schema}
        AND fk.table_name = #{quoted_table}
      SQL

      # Column order in the SELECT matches ForeignKey#initialize.
      @connection.execute(sql).map { |row| ForeignKey.new(*row) }
    end

    # @return [Array] names of the primary key columns of this table
    def primary_keys
      quoted_schema = @connection.quote(@schema)
      quoted_table = @connection.quote(@table)

      sql = <<-SQL
        SELECT column_name
        FROM information_schema.key_column_usage
        WHERE constraint_name = 'PRIMARY'
        AND table_schema = #{quoted_schema}
        AND table_name = #{quoted_table}
      SQL

      @connection.execute(sql).map(&:first)
    end

  end
end
@@ -0,0 +1,3 @@
1
module RdbmsSampler
  # Release number of the gem; the gemspec and the command line program both
  # read it from here.
  VERSION = '1.1.1'
end
@@ -0,0 +1,29 @@
1
# Make lib/ loadable so the version constant can be read while building.
$:.push File.expand_path("../lib", __FILE__)
require "rdbms_sampler/version"

Gem::Specification.new do |s|
  s.name        = "rdbms_sampler"
  s.version     = RdbmsSampler::VERSION
  s.licenses    = ['MIT']
  s.authors     = ["Christian Rishoj"]
  s.email       = ["christian@rishoj.net"]
  s.homepage    = "https://github.com/crishoj/rdbms_sampler"
  s.summary     = %q{Extract a sample of records from a database while maintaining referential integrity.}
  s.description = %q{Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.}

  # `rubyforge_project` is intentionally not set: the attribute is deprecated
  # in RubyGems and only produces a warning when the gem is built.

  # The packaged file list is whatever git tracks at build time.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "schema_plus_foreign_keys", '~> 0.1', '>= 0.1.7'
  s.add_dependency "activerecord", '~> 5.0'
  s.add_dependency "commander", '~> 4.4'
  s.add_dependency "mysql2", '~> 0.4', '>= 0.4.4'

  # NOTE(review): lib/rdbms_sampler/table_sample.rb requires 'pry' at load
  # time, yet pry is only a development dependency here. Confirm whether that
  # require should be removed from the library code.
  s.add_development_dependency "pry", '~> 0.9'
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rdbms_sampler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Christian Rishoj
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-02-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: schema_plus_foreign_keys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.1'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 0.1.7
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '0.1'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 0.1.7
33
+ - !ruby/object:Gem::Dependency
34
+ name: activerecord
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '5.0'
40
+ type: :runtime
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '5.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: commander
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.4'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.4'
61
+ - !ruby/object:Gem::Dependency
62
+ name: mysql2
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '0.4'
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: 0.4.4
71
+ type: :runtime
72
+ prerelease: false
73
+ version_requirements: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: '0.4'
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 0.4.4
81
+ - !ruby/object:Gem::Dependency
82
+ name: pry
83
+ requirement: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - "~>"
86
+ - !ruby/object:Gem::Version
87
+ version: '0.9'
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - "~>"
93
+ - !ruby/object:Gem::Version
94
+ version: '0.9'
95
+ description: |-
96
+ Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
97
+ put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
98
+ take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.
99
+ email:
100
+ - christian@rishoj.net
101
+ executables:
102
+ - rdbms_sampler
103
+ extensions: []
104
+ extra_rdoc_files: []
105
+ files:
106
+ - ".gitignore"
107
+ - Gemfile
108
+ - README.md
109
+ - Rakefile
110
+ - bin/rdbms_sampler
111
+ - lib/rdbms_sampler.rb
112
+ - lib/rdbms_sampler/dependency.rb
113
+ - lib/rdbms_sampler/foreign_key.rb
114
+ - lib/rdbms_sampler/sample.rb
115
+ - lib/rdbms_sampler/table_sample.rb
116
+ - lib/rdbms_sampler/version.rb
117
+ - rdbms_sampler.gemspec
118
+ homepage: https://github.com/crishoj/rdbms_sampler
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project: rdbms_sampler
138
+ rubygems_version: 2.7.6
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Extract a sample of records from a database while maintaining referential
142
+ integrity.
143
+ test_files: []