rdbms_sampler 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d9834c6b4cba2c6c8271f9e8f744e43d1c8350b3465db252fb85b059ea1c93cd
4
+ data.tar.gz: feee1ba4541dc3e05eb3697272f0bdb049c5bdfbfa17344ad5b9d58675df5672
5
+ SHA512:
6
+ metadata.gz: 3646c28b02866c51364d89ca2e97b847fe6d1a72bc5904e5085b5eae7267effac3ac185dca8ccda389cdc645359e106324a5a68e5bf3691ac56c9138e2e0116b
7
+ data.tar.gz: fa0096829e7bae368a20b934284400aa39fc7c87e1840a39a381d7e970e6032a01422a721ba2d9d0b3cc8b17d130ae57c514191dae71a7ab443167f8ff71073e
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
6
+ *.sql
7
+ og
8
+ *~
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in rdbms_sampler.gemspec
gemspec
5
+
6
+
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ RDBMS Sampler
2
+ =============
3
+
4
+ Command line utility for extracting a sample (subset of all records) from a relational
5
+ database system (such as MySQL) while *maintaining the referential integrity* of the sample.
6
+
7
+ Description
8
+ -----------
9
+
10
+ Need e.g. 1000 rows from each of your production tables, but feel the pain of making
11
+ sure to include dependent rows, their dependents and so on, ad infinitum?
12
+
13
+ Look no further. This tiny utility will take care that referential dependencies are
14
+ fulfilled by recursively expanding the row sample with unfulfilled dependencies until
15
+ the sample is referentially consistent.
16
+
17
+ Installation
18
+ ------------
19
+
20
+ Install with `gem install rdbms_sampler`.
21
+
22
+ Alternatively, clone the repository and install dependencies with `bundle install`.
23
+ Then execute with `bundle exec rdbms_sampler ...`.
24
+
25
+ Commands
26
+ --------
27
+
28
+ help Display global or [command] help documentation.
29
+ sample Extract a sample from the given connection
30
+
31
+ Options
32
+ -------
33
+
34
+ --adapter NAME
35
+ ActiveRecord adapter to use
36
+
37
+ --databases NAMES
38
+ Comma-separated list of databases to sample
39
+
40
+ --username USER
41
+ Username for connection
42
+
43
+ --password PASSWORD
44
+ Password for connection
45
+
46
+ --encoding ENCODING
47
+ Encoding for connection
48
+
49
+ --host HOST
50
+ Host name or IP for connection
51
+
52
+ --socket PATH
53
+ Socket for connection
54
+
55
+ --rows NUM
56
+ Number of rows to sample per table
57
+
58
+ --log PATH
59
+ Log queries to PATH
60
+
61
+ Global Options
62
+ --------------
63
+
64
+ -h, --help
65
+ Display help documentation
66
+
67
+ -v, --version
68
+ Display version information
69
+
70
+ -t, --trace
71
+ Display backtrace when an error occurs
72
+
73
+ Usage
74
+ -----
75
+
76
+ rdbms_sampler --databases DB1,DB2 --username USER --password PASS --rows 100 > sample.sql
77
+
78
+
79
+
80
+ CAVEATS
81
+ -------
82
+
83
+ Only single-column foreign keys are currently handled.
84
+
85
+ Additionally, due to a bug in the current implementation, if a referenced column
86
+ is named anything but `id`, referenced rows might get included multiple times.
87
+
88
+ You will probably need to disable foreign key checks *during import* (in MySQL: `SET FOREIGN_KEY_CHECKS = 0;`), since the inserts in
89
+ the output are not ordered with respect to referential integrity.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler/gem_tasks'
2
+
data/bin/rdbms_sampler ADDED
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby
# Command line entry point: defines the `sample` command (also the default
# command), connects through ActiveRecord and prints the sample as SQL on
# standard output. Progress messages go to standard error.
require "rdbms_sampler"
require "commander/import"
require "logger"

program :version, RdbmsSampler::VERSION
program :description, 'Extract a sample of desired size from a database while ensuring referential integrity.'
default_command :sample

command :sample do |c|
  c.description = 'Extract a sample from the given connection'
  c.option '--adapter NAME', String, 'ActiveRecord adapter to use'
  c.option '--databases NAMES', String, 'Comma-separated names of databases to sample'
  c.option '--username USER', String, 'Username for connection'
  c.option '--password PASSWORD', String, 'Password for connection'
  c.option '--encoding ENCODING', String, 'Encoding for connection'
  c.option '--socket PATH', String, 'Socket for connection'
  c.option '--host HOST_NAME', String, 'Host name'
  c.option '--rows NUM', Integer, 'Number of rows to sample per table'
  c.option '--log PATH', String, 'Log queries to PATH'
  c.when_called do |_args, options|
    options.default \
      :adapter => 'mysql2',
      :username => 'root',
      :encoding => 'utf8',
      :cast => false,
      :rows => 1000

    # --databases has no default; without this check a missing option would
    # surface as a NoMethodError on nil instead of a usable message.
    schemas = options.databases.to_s.split(',').map(&:strip).reject(&:empty?)
    abort 'No databases given. Name the databases to sample with --databases DB1,DB2' if schemas.empty?

    ActiveRecord::Base.logger = Logger.new(options.log) if options.log
    warn 'Connecting...'
    # The whole option table is handed to ActiveRecord as connection settings.
    ActiveRecord::Base.establish_connection(options.__hash__).with_connection do |conn|
      puts RdbmsSampler::Sample.new(conn: conn, rows_per_table: options.rows, schemas: schemas).to_sql
    end
  end
end
36
+
@@ -0,0 +1,5 @@
1
+ require "rdbms_sampler/version"
2
+ require "rdbms_sampler/sample"
3
+ require "active_record"
4
+
5
+
@@ -0,0 +1,35 @@
1
module RdbmsSampler
  # A reference from one row (the "parent", which holds the foreign key) to
  # the row it points at (the "child", identified by schema, table, key
  # column and key value).
  #
  # Two dependencies are considered equal when they point at the same child
  # row, regardless of which parent row they originate from. `hash` is
  # defined consistently with that equality so that instances are
  # de-duplicated when stored in a Set or used as Hash keys.
  class Dependency

    attr_reader :parent_schema
    attr_reader :parent_table
    attr_reader :parent_key
    attr_reader :child_schema_name
    attr_reader :child_table_name
    attr_reader :child_key
    attr_reader :value

    # @param parent_schema [String] schema of the referencing table
    # @param parent_table [String] name of the referencing table
    # @param parent_key [String] referencing (foreign key) column
    # @param child_schema [String] schema of the referenced table
    # @param child_table [String] name of the referenced table
    # @param child_key [String] referenced column
    # @param value [Object] value of the foreign key in the referencing row
    def initialize(parent_schema, parent_table, parent_key, child_schema, child_table, child_key, value)
      @parent_schema = parent_schema
      @parent_table = parent_table
      @parent_key = parent_key
      @child_schema_name = child_schema
      @child_table_name = child_table
      @child_key = child_key
      @value = value
    end

    # @return [String] "schema.table" of the referenced table
    def identifier
      "#{child_schema_name}.#{child_table_name}"
    end

    # @param other [Object]
    # @return [Boolean] true when +other+ is a Dependency on the same child row
    def ==(other)
      other.is_a?(Dependency) &&
        identifier == other.identifier &&
        child_key == other.child_key &&
        value == other.value
    end
    alias eql? ==

    # Hash code built from exactly the attributes compared by #eql?; without
    # it Set and Hash fall back to object identity and never see duplicates.
    # @return [Integer]
    def hash
      [identifier, child_key, value].hash
    end

    # @return [String] human readable description, used in error messages
    def to_s
      "reference from #{parent_schema}.#{parent_table}.#{parent_key} to #{identifier}[#{child_key}=#{value}]"
    end

  end
end
@@ -0,0 +1,26 @@
1
module RdbmsSampler

  # Read-only description of a single-column foreign key constraint: the
  # referencing schema/table/column and the schema/table/column it refers to.
  class ForeignKey
    attr_reader :constraint_name, :schema, :table, :key,
                :referenced_schema, :referenced_table, :referenced_key

    # The argument order matches the column order of the query that
    # discovers foreign keys, so a result row can be splatted into `new`.
    def initialize(constraint_name, schema, table, key,
                   referenced_schema, referenced_table, referenced_key)
      @constraint_name = constraint_name
      @schema, @table, @key = schema, table, key
      @referenced_schema, @referenced_table, @referenced_key =
        referenced_schema, referenced_table, referenced_key
    end

  end

end
@@ -0,0 +1,76 @@
1
+ require 'rdbms_sampler/table_sample'
2
+ require 'active_support/core_ext/array'
3
+
4
module RdbmsSampler

  # A sample spanning every base table of the configured schemas. Each table
  # is sampled on its own, after which the sample is expanded until no table
  # sample reports any newly introduced dependencies.
  class Sample

    # @param [Hash] options
    #   :conn           connection used for all queries
    #   :schemas        names of the schemas (databases) to sample
    #   :rows_per_table initial number of rows per table (defaults to 1000)
    def initialize(options = {})
      @connection     = options[:conn]
      @schemas        = options[:schemas]
      @rows_per_table = options[:rows_per_table] || 1000
      @table_samples  = {}
      @computed       = false
    end

    # Discover the tables, sample each of them and expand the sample until it
    # is referentially consistent. Progress is reported through `warn`.
    def compute!
      schema_list = @schemas.collect { |name| @connection.quote_table_name(name) }
      warn "Discovering tables in databases: #{schema_list.to_sentence}..."
      tables_without_views.each do |schema_name, table_name|
        candidate = TableSample.new(@connection, schema_name, table_name, @rows_per_table)
        @table_samples[candidate.identifier] = candidate
      end
      return warn 'No tables found!' if @table_samples.empty?

      warn "Sampling #{@table_samples.count} tables..."
      @table_samples.each_value(&:sample!)

      warn 'Ensuring referential integrity...'
      # Always run at least one pass; stop after the first pass that
      # introduces nothing new.
      discovered = nil
      until discovered == 0
        discovered = expand_once
        warn " Discovered #{discovered} new dependencies" if discovered > 0
      end
      warn 'Referential integrity obtained'

      warn 'Final sample contains:'
      @table_samples.each_value do |entry|
        warn " #{entry.size} row(s) from `#{entry.identifier}`"
      end
      @computed = true
    end

    # @param [Dependency]
    # @return [TableSample] the table sample holding the referenced table
    # @raise [RuntimeError] when the referenced table is not part of the sample
    def table_sample_for_dependency(dependency)
      wanted = dependency.identifier
      @table_samples.fetch(wanted) { raise "Table sample for [#{wanted}] not found" }
    end

    # @return [String] the SQL of all table samples, computing the sample first if needed
    def to_sql
      compute! unless @computed
      @table_samples.values.map(&:to_sql).join("\n")
    end

    private

    # Let every table sample resolve its pending dependencies once.
    # Returns the total number of newly introduced dependencies.
    def expand_once
      total = 0
      @table_samples.values.each do |entry|
        added = entry.ensure_referential_integrity(self)
        next unless added > 0
        total += added
        warn " Expanded sample with #{added} new rows referenced from table #{entry.quoted_name}"
      end
      total
    end

    # Schema/table name pairs of all base tables (views excluded) in the
    # configured schemas.
    def tables_without_views
      quoted_schema_names = @schemas.map { |name| @connection.quote(name) }.join(', ')
      @connection.execute <<SQL
        SELECT TABLE_SCHEMA, TABLE_NAME
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_TYPE = 'BASE TABLE'
        AND TABLE_SCHEMA IN (#{quoted_schema_names})
SQL
    end
  end

end
@@ -0,0 +1,169 @@
1
require 'set'
require 'pry'
require 'rdbms_sampler/dependency'
require 'rdbms_sampler/foreign_key'
4
+
5
module RdbmsSampler

  # A set of rows sampled from one table, together with the dependencies
  # (references to rows in other tables) that those rows introduce and that
  # have not yet been resolved.
  class TableSample

    # Dependencies introduced by sampled rows that are still to be resolved.
    attr_reader :pending_dependencies

    # @param connection connection used for all queries and for quoting
    # @param [String] schema_name
    # @param [String] table_name
    # @param [Integer] size number of rows to fetch initially
    def initialize(connection, schema_name, table_name, size = 1000)
      @schema = schema_name
      @table = table_name
      @connection = connection
      @size = size
      @pending_dependencies = Set.new
      @sample = Set.new
      @sampled = false
      # column name => Set of the non-nil values of that column among the
      # sampled rows. An entry is built the first time a column is asked for
      # and kept up to date by #add afterwards.
      @sampled_values = {}
    end

    # Fetch the initial rows (once only).
    # @return [Set] the sampled rows
    def sample!
      fetch(@size) unless @sampled
      @sample
    end

    # @return [Integer] number of sampled rows, or the requested size before sampling
    def size
      @sampled ? @sample.size : @size
    end

    # @return [String] "schema.table"
    def identifier
      "#{@schema}.#{@table}"
    end

    # Add the row referenced by the given dependency to the sample.
    # @param [Dependency] dependency
    # @return [Integer] number of new dependencies introduced by the added row
    # @raise [RuntimeError] when no row matches the dependency
    def fulfil(dependency)
      return 0 if fulfilled?(dependency)
      quoted_column = @connection.quote_column_name(dependency.child_key)
      quoted_value = @connection.quote(dependency.value)
      sql = "SELECT * FROM #{quoted_name} WHERE #{quoted_column} = #{quoted_value}"
      row = @connection.select_one(sql)
      raise "Could not fulfil #{dependency} using query [#{sql}]" if row.nil?
      add(row)
    end

    # Whether the sample already holds a row whose referenced column has the
    # value the dependency asks for. Works for any referenced column, not
    # only `id`.
    # @param [Dependency] dependency
    def fulfilled?(dependency)
      sampled_values_for(dependency.child_key).include?(dependency.value)
    end

    # Add a row to the table sample.
    # Returns number of new dependencies introduced.
    def add(row)
      return 0 unless @sample.add?(row)
      # Keep the already built column indexes in step with the sample.
      @sampled_values.each do |column, values|
        value = row[column]
        values << value unless value.nil?
      end
      dependencies_for(row).count { |dep| @pending_dependencies.add?(dep) }
    end

    # Resolve all currently pending dependencies against the table samples
    # they point at.
    # @param [Sample] sample
    # @return [Integer] number of new dependencies introduced while doing so
    def ensure_referential_integrity(sample)
      # Swap in a fresh set first: fulfilling a self-referencing dependency
      # adds to @pending_dependencies while the old set is being walked.
      in_progress = @pending_dependencies
      @pending_dependencies = Set.new
      in_progress.inject(0) do |total, dependency|
        total + sample.table_sample_for_dependency(dependency).fulfil(dependency)
      end
    end

    # @return [String] a comment header followed by INSERT statements for the sampled rows
    def to_sql
      ret = "\n-- Sample from #{quoted_name} (#{@sample.count} rows)\n"
      unless @sample.empty?
        quoted_cols = @sample.first.keys.map { |col| @connection.quote_column_name(col) }
        # INSERT in batches to reduce the likelihood of hitting `max_allowed_packet`
        @sample.each_slice(250) do |rows|
          values = rows.map { |row|
            row.values.map { |val| @connection.quote(val) }.join(',')
          }.join("),\n (")
          ret << "INSERT INTO #{quoted_name} \n (#{quoted_cols * ','}) \nVALUES \n (#{values});\n"
        end
      end
      ret
    end

    # @return [String] quoted, schema-qualified table name
    def quoted_name
      @connection.quote_table_name(@schema) + '.' + @connection.quote_table_name(@table)
    end

    protected

    # Index of the non-nil values that the sampled rows hold in +column+.
    # @param [String] column
    # @return [Set]
    def sampled_values_for(column)
      @sampled_values[column] ||= Set.new(@sample.map { |row| row[column] }.compact)
    end

    # Fetch +count+ rows, ordered by the first primary key column descending
    # when the table has a primary key.
    def fetch(count = 1000)
      sql = "SELECT * FROM #{quoted_name}"
      pks = primary_keys
      unless pks.empty?
        order_by = @connection.quote_column_name(pks.first)
        sql += " ORDER BY #{order_by} DESC"
      end
      sql += " LIMIT #{count}"
      warn " Sampling #{count} rows from #{quoted_name}..."
      @connection.select_all(sql).each { |row| add(row) }
      @sampled = true
    end

    # @param [ForeignKey] fk
    # @param [Hash] row
    # @return [Dependency, nil] nil when the row holds no value for the foreign key
    def dependency_for(fk, row)
      value = row[fk.key]
      return if value.nil?
      Dependency.new(fk.schema, fk.table, fk.key, fk.referenced_schema, fk.referenced_table, fk.referenced_key, value)
    end

    # @param [Hash] row
    # @return [Array<Dependency>]
    def dependencies_for(row)
      foreign_keys.map { |fk| dependency_for(fk, row) }.compact
    end

    # Foreign keys of this table, discovered on first use.
    def foreign_keys
      @fks ||= discover_foreign_keys
    end

    def discover_foreign_keys
      quoted_schema = @connection.quote(@schema)
      quoted_table = @connection.quote(@table)

      sql = <<SQL
        SELECT
        fk.constraint_name,
        fk.table_schema,
        fk.table_name,
        fk.column_name,
        fk.referenced_table_schema,
        fk.referenced_table_name,
        fk.referenced_column_name
        FROM information_schema.key_column_usage fk
        WHERE fk.referenced_column_name IS NOT NULL
        AND fk.table_schema = #{quoted_schema}
        AND fk.table_name = #{quoted_table}
SQL

      # Each result row is splatted into ForeignKey.new, so the column order
      # above must match the constructor's argument order.
      @connection.execute(sql).map { |row| ForeignKey.new(*row) }
    end

    # @return [Array<String>] names of the primary key columns
    def primary_keys
      quoted_schema = @connection.quote(@schema)
      quoted_table = @connection.quote(@table)

      sql = <<SQL
        SELECT column_name
        FROM information_schema.key_column_usage
        WHERE constraint_name = 'PRIMARY'
        AND table_schema = #{quoted_schema}
        AND table_name = #{quoted_table}
SQL

      @connection.execute(sql).map(&:first)
    end

  end
end
@@ -0,0 +1,3 @@
1
module RdbmsSampler
  # Release number of the gem; the gemspec and the executable's
  # `program :version` declaration both read it from here.
  VERSION = '1.1.1'
end
@@ -0,0 +1,29 @@
1
# Gem specification for rdbms_sampler.
# Make lib/ loadable so the version constant can be read without installing the gem.
$:.push File.expand_path("lib", __dir__)
require "rdbms_sampler/version"

Gem::Specification.new do |s|
  s.name = "rdbms_sampler"
  s.version = RdbmsSampler::VERSION
  s.licenses = ['MIT']
  s.authors = ["Christian Rishoj"]
  s.email = ["christian@rishoj.net"]
  s.homepage = "https://github.com/crishoj/rdbms_sampler"
  s.summary = %q{Extract a sample of records from a database while maintaining referential integrity.}
  s.description = %q{Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.}

  # The packaged files are whatever git tracks at build time.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "schema_plus_foreign_keys", '~> 0.1', '>= 0.1.7'
  s.add_dependency "activerecord", '~> 5.0'
  s.add_dependency "commander", '~> 4.4'
  s.add_dependency "mysql2", '~> 0.4', '>= 0.4.4'
  # lib/rdbms_sampler/table_sample.rb requires pry unconditionally, so it has
  # to be installed together with the gem. It can go back to being a
  # development dependency once that require is removed.
  s.add_dependency "pry", '~> 0.9'
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rdbms_sampler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Christian Rishoj
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-02-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: schema_plus_foreign_keys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.1'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 0.1.7
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '0.1'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 0.1.7
33
+ - !ruby/object:Gem::Dependency
34
+ name: activerecord
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '5.0'
40
+ type: :runtime
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '5.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: commander
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.4'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.4'
61
+ - !ruby/object:Gem::Dependency
62
+ name: mysql2
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '0.4'
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: 0.4.4
71
+ type: :runtime
72
+ prerelease: false
73
+ version_requirements: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: '0.4'
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 0.4.4
81
+ - !ruby/object:Gem::Dependency
82
+ name: pry
83
+ requirement: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - "~>"
86
+ - !ruby/object:Gem::Version
87
+ version: '0.9'
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - "~>"
93
+ - !ruby/object:Gem::Version
94
+ version: '0.9'
95
+ description: |-
96
+ Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
97
+ put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
98
+ take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.
99
+ email:
100
+ - christian@rishoj.net
101
+ executables:
102
+ - rdbms_sampler
103
+ extensions: []
104
+ extra_rdoc_files: []
105
+ files:
106
+ - ".gitignore"
107
+ - Gemfile
108
+ - README.md
109
+ - Rakefile
110
+ - bin/rdbms_sampler
111
+ - lib/rdbms_sampler.rb
112
+ - lib/rdbms_sampler/dependency.rb
113
+ - lib/rdbms_sampler/foreign_key.rb
114
+ - lib/rdbms_sampler/sample.rb
115
+ - lib/rdbms_sampler/table_sample.rb
116
+ - lib/rdbms_sampler/version.rb
117
+ - rdbms_sampler.gemspec
118
+ homepage: https://github.com/crishoj/rdbms_sampler
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project: rdbms_sampler
138
+ rubygems_version: 2.7.6
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Extract a sample of records from a database while maintaining referential
142
+ integrity.
143
+ test_files: []