data_sampler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +9 -0
- data/README +47 -0
- data/Rakefile +2 -0
- data/bin/data_sampler +34 -0
- data/data_sampler.gemspec +25 -0
- data/lib/data_sampler/dependency.rb +21 -0
- data/lib/data_sampler/sample.rb +41 -0
- data/lib/data_sampler/table_sample.rb +117 -0
- data/lib/data_sampler/version.rb +3 -0
- data/lib/data_sampler.rb +5 -0
- metadata +87 -0
data/Gemfile
ADDED
data/README
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
|
3
|
+
put off by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
|
4
|
+
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.
|
5
|
+
|
6
|
+
COMMANDS:
|
7
|
+
|
8
|
+
help Display global or [command] help documentation.
|
9
|
+
sample Extract a sample from the given connection
|
10
|
+
|
11
|
+
OPTIONS:
|
12
|
+
|
13
|
+
--adapter NAME
|
14
|
+
ActiveRecord adapter to use
|
15
|
+
|
16
|
+
--database NAME
|
17
|
+
Name of database to sample
|
18
|
+
|
19
|
+
--username USER
|
20
|
+
Username for connection
|
21
|
+
|
22
|
+
--password PASSWORD
|
23
|
+
Password for connection
|
24
|
+
|
25
|
+
--encoding ENCODING
|
26
|
+
Encoding for connection
|
27
|
+
|
28
|
+
--socket PATH
|
29
|
+
Socket for connection
|
30
|
+
|
31
|
+
--rows NUM
|
32
|
+
Number of rows to sample per table
|
33
|
+
|
34
|
+
--log PATH
|
35
|
+
Log queries to PATH
|
36
|
+
|
37
|
+
GLOBAL OPTIONS:
|
38
|
+
|
39
|
+
-h, --help
|
40
|
+
Display help documentation
|
41
|
+
|
42
|
+
-v, --version
|
43
|
+
Display version information
|
44
|
+
|
45
|
+
-t, --trace
|
46
|
+
Display backtrace when an error occurs
|
47
|
+
|
data/Rakefile
ADDED
data/bin/data_sampler
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "data_sampler"
|
3
|
+
require "commander/import"
|
4
|
+
require "logger"
|
5
|
+
|
6
|
+
# Program-level metadata for the Commander-based command line interface.
program :version, DataSampler::VERSION
program :description, 'Extract a sample of desired size from a database while ensuring referential integrity.'

# `sample` runs when no command name is given on the command line.
default_command :sample

# sample: connect to the database described by the options and print the
# SQL produced by DataSampler::Sample to standard output.
command :sample do |c|
  c.description = 'Extract a sample from the given connection'

  # Connection settings; the resulting option hash is handed to ActiveRecord below.
  c.option '--adapter NAME', String, 'ActiveRecord adapter to use'
  c.option '--database NAME', String, 'Name of database to sample'
  c.option '--username USER', String, 'Username for connection'
  c.option '--password PASSWORD', String, 'Password for connection'
  c.option '--encoding ENCODING', String, 'Encoding for connection'
  c.option '--socket PATH', String, 'Socket for connection'

  # Sampling and logging settings.
  c.option '--rows NUM', Integer, 'Number of rows to sample per table'
  c.option '--log PATH', String, 'Log queries to PATH'

  c.when_called do |_args, options|
    # Fallback values for options that were not supplied by the user.
    options.default(
      :adapter => 'mysql',
      :database => 'test',
      :username => 'root',
      :encoding => 'utf8',
      :socket => '/opt/local/var/run/mysql5/mysqld.sock',
      :rows => 1000
    )

    # Query logging is opt-in: only set a logger when --log was given.
    if options.log
      ActiveRecord::Base.logger = Logger.new(options.log)
    end

    pool = ActiveRecord::Base.establish_connection(options.__hash__)
    pool.with_connection do |conn|
      sampler = DataSampler::Sample.new(conn, options.rows)
      puts sampler.to_sql
    end
  end
end
|
34
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "data_sampler/version"
|
4
|
+
|
5
|
+
# Gem specification for data_sampler. The version comes from
# DataSampler::VERSION; the packaged file lists are taken from `git ls-files`.
Gem::Specification.new do |s|
  s.name        = "data_sampler"
  s.version     = DataSampler::VERSION
  s.authors     = ["Christian Rishoj"]
  s.email       = ["christian@rishoj.net"]
  s.homepage    = "https://github.com/crishoj/data_sampler"
  s.summary     = %q{Extract a sample of records from a database while maintaining referential integrity.}
  s.description = %q{Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.}

  s.rubyforge_project = "data_sampler"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "schema_plus"
  s.add_dependency "activerecord"
  # bin/data_sampler requires "commander/import"; without this runtime
  # dependency the installed executable fails with a LoadError.
  s.add_dependency "commander"
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DataSampler
  # A requirement that a particular row exists in another table: the
  # referenced table's name plus the column => value pairs identifying the row.
  #
  # Instances are value objects. Two dependencies with the same table name and
  # keys are equal and hash identically, so they collapse to a single entry
  # when stored in a Set or used as Hash keys.
  class Dependency

    # Name of the table the dependency points at.
    attr_reader :table_name
    # Hash of column name => value identifying the required row.
    attr_reader :keys

    # @param table_name [String] name of the referenced table
    # @param keys [Hash] column name => value pairs identifying the row
    def initialize(table_name, keys)
      @table_name = table_name
      @keys = keys
    end

    # Value equality: same table and same key/value pairs.
    # Returns false (rather than raising) for objects that are not dependencies.
    def eql?(other)
      other.is_a?(Dependency) &&
        table_name == other.table_name &&
        keys == other.keys
    end
    alias_method :==, :eql?

    # Must agree with #eql?; Set and Hash look at #hash first, so without it
    # equal dependencies would never be recognised as duplicates.
    def hash
      [table_name, keys].hash
    end

    # Human-readable description, e.g. for log or warning output.
    def to_s
      "#{keys} in table #{table_name}"
    end

  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "data_sampler/table_sample"
|
2
|
+
|
3
|
+
module DataSampler

  # A sample of a whole database: one TableSample per table reported by the
  # connection, extended until no table sample reports new dependencies.
  class Sample

    # @param connection database connection; must respond to #tables and is
    #   passed on to every TableSample
    # @param rows_per_table [Integer] number of rows to sample from each table
    def initialize(connection, rows_per_table = 1000)
      @connection = connection
      @rows_per_table = rows_per_table
      @table_samples = {}
      @computed = false
    end

    # Samples every table, then repeatedly asks each table sample to resolve
    # its pending dependencies until a full pass discovers nothing new.
    # Progress messages are written with `warn`.
    #
    # @return [true]
    def compute!
      @connection.tables.each do |table_name|
        # Workaround for inconsistent casing in table definitions (http://bugs.mysql.com/bug.php?id=60773)
        # Non-mutating downcase: the strings belong to the connection and may be frozen.
        normalized_name = table_name.downcase
        @table_samples[normalized_name] = TableSample.new(@connection, normalized_name, @rows_per_table)
      end
      warn "Sampling #{@table_samples.count} tables..."
      @table_samples.each_value(&:sample!)
      warn "Ensuring referential integrity..."
      loop do
        # Number of table samples that reported new dependencies in this pass.
        new_dependencies = @table_samples.values.count do |table_sample|
          table_sample.ensure_referential_integrity(@table_samples)
        end
        break if new_dependencies.zero?
        warn " - discovered #{new_dependencies} new dependencies"
      end
      warn " - referential integrity obtained"
      @computed = true
    end

    # SQL for the whole sample: the output of each table sample's #to_sql,
    # joined by newlines. Computes the sample first if that has not happened yet.
    #
    # @return [String]
    def to_sql
      compute! unless @computed
      @table_samples.values.map(&:to_sql).join("\n")
    end

  end

end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require "set"
require "data_sampler/dependency"
|
2
|
+
|
3
|
+
module DataSampler

  # The rows sampled from a single table, together with the foreign-key
  # dependencies (rows required in other tables) that those rows introduce
  # and that have not been resolved yet.
  class TableSample

    # Name of the sampled table.
    attr_reader :table_name
    # Set of dependencies discovered but not yet handed to their target table.
    attr_reader :pending_dependencies

    # @param connection database connection used for quoting, queries,
    #   primary key and foreign key lookup
    # @param table_name [String] table to sample
    # @param size [Integer] number of rows fetched by the initial sample
    def initialize(connection, table_name, size = 1000)
      @table_name = table_name
      @connection = connection
      @size = size
      @pending_dependencies = Set.new
      @sample = Set.new
      @sampled = false
      @sampled_ids = Set.new
    end

    # Fetches the initial sample on the first call; later calls are no-ops.
    #
    # @return [Set] the rows sampled so far
    def sample!
      unless @sampled
        fetch_sample(@size)
        # Remember that the fetch happened so repeated calls do not re-query.
        @sampled = true
      end
      @sample
    end

    # Makes sure the row described by +dependency+ is part of this sample,
    # fetching it from the database when necessary.
    #
    # @param dependency object responding to #keys (column => value Hash)
    # @return [Boolean] true when adding the row produced new pending dependencies
    def fulfil(dependency)
      return false if fulfilled?(dependency)
      conditions = dependency.keys.map do |col, val|
        "#{@connection.quote_column_name(col)} = #{@connection.quote(val)}"
      end
      sql = "SELECT * FROM #{@connection.quote_table_name(@table_name)} WHERE #{conditions.join(' AND ')}"
      row = @connection.select_one(sql)
      # No matching row (dangling reference): there is nothing to add, and a
      # nil must never end up in the sample.
      return false if row.nil?
      add(row)
    end

    # Whether the row required by +dependency+ is already known to be sampled.
    #
    # @return [Boolean]
    def fulfilled?(dependency)
      # FIXME: Only checks id column
      keys = dependency.keys
      return false unless keys.size == 1
      keys.key?('id') && @sampled_ids.include?(keys['id'])
    end

    # Adds +row+ to the sample and records the dependencies it introduces.
    #
    # @param row [Hash] column name => value
    # @return [Boolean] true when at least one new pending dependency was recorded;
    #   false when the row was already sampled or nothing new was found
    def add(row)
      return false unless @sample.add?(row)
      @sampled_ids.add(row['id']) if row['id']
      newly_pending = dependencies_for(row).select { |dep| @pending_dependencies.add?(dep) }
      !newly_pending.empty?
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      false
    end

    # Hands every pending dependency to the sample of the table it refers to.
    # The pending set is swapped for a fresh one first, so dependencies
    # discovered meanwhile are kept for a later pass.
    #
    # @param table_samples [Hash] table name => TableSample
    # @return [Boolean] true when any target reported new pending dependencies
    def ensure_referential_integrity(table_samples)
      any_new = false
      deps_in_progress = @pending_dependencies
      @pending_dependencies = Set.new
      deps_in_progress.each do |dependency|
        # Fall back to the downcased name in case the map is keyed by
        # lower-case table names while the foreign key uses another casing.
        target = table_samples[dependency.table_name] ||
                 table_samples[dependency.table_name.to_s.downcase]
        if target.nil?
          warn " - skipping dependency on unknown table: #{dependency}"
          next
        end
        any_new = true if target.fulfil(dependency)
      end
      any_new
    end

    # Renders the sample as SQL: a comment line with the row count followed by
    # one INSERT statement per row. Column names are taken from the first row.
    #
    # @return [String]
    def to_sql
      ret = ["-- #{@table_name}: #{@sample.count} rows"]
      unless @sample.empty?
        quoted_cols = @sample.first.keys.map { |col| @connection.quote_column_name(col) }
        insert = "INSERT INTO #{@connection.quote_table_name(@table_name)} (#{quoted_cols.join(',')})"
        @sample.each do |row|
          quoted_vals = row.values.map { |val| @connection.quote(val) }
          ret << "#{insert} VALUES (#{quoted_vals.join(',')})"
        end
      end
      ret.join("\n")
    end

    protected

    # Selects up to +count+ rows, ordered by primary key descending when the
    # table has one, and adds each of them to the sample.
    def fetch_sample(count)
      sql = "SELECT * FROM #{@connection.quote_table_name(@table_name)}"
      pk = @connection.primary_key(@table_name)
      sql += " ORDER BY #{@connection.quote_column_name(pk)} DESC" unless pk.nil?
      sql += " LIMIT #{count}"
      @connection.select_all(sql).each { |row| add(row) }
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      []
    end

    def samplable?
      # We shouldn't be sampling views
      @connection.views.grep(@table_name).empty?
    end

    # Builds the Dependency that +row+ has through foreign key +fk+ by pairing
    # the key's columns with the referenced columns in order. Columns whose
    # value in the row is nil are left out; returns nil when none remain.
    def dependency_for(fk, row)
      ref = {}
      cols = fk.column_names.dup
      raise "No column names in foreign key #{fk.inspect}" if cols.empty?
      fk.references_column_names.each do |ref_col|
        col = cols.shift
        ref[ref_col] = row[col] unless row[col].nil?
      end
      Dependency.new(fk.references_table_name, ref) unless ref.empty?
    end

    # All dependencies +row+ has through this table's foreign keys.
    def dependencies_for(row)
      foreign_keys.map { |fk| dependency_for(fk, row) }.compact
    end

    # Foreign keys of this table, looked up once and memoized.
    def foreign_keys
      @fks ||= @connection.foreign_keys(@table_name)
    end

  end
end
|
data/lib/data_sampler.rb
ADDED
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: data_sampler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Christian Rishoj
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-08-03 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: schema_plus
|
16
|
+
requirement: &70132762292640 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70132762292640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activerecord
|
27
|
+
requirement: &70132762292220 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70132762292220
|
36
|
+
description: ! 'Ever found yourself wanting a modest amount of fresh rows from a production
|
37
|
+
database for development purposes, but
|
38
|
+
|
39
|
+
put back by the need to maintain referential integrity in the extracted data sample?
|
40
|
+
This data sampler utility will
|
41
|
+
|
42
|
+
take care that referential dependencies are fulfilled by recursively fetching any
|
43
|
+
rows referred to by the sample.'
|
44
|
+
email:
|
45
|
+
- christian@rishoj.net
|
46
|
+
executables:
|
47
|
+
- data_sampler
|
48
|
+
extensions: []
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- .gitignore
|
52
|
+
- Gemfile
|
53
|
+
- README
|
54
|
+
- Rakefile
|
55
|
+
- bin/data_sampler
|
56
|
+
- data_sampler.gemspec
|
57
|
+
- lib/data_sampler.rb
|
58
|
+
- lib/data_sampler/dependency.rb
|
59
|
+
- lib/data_sampler/sample.rb
|
60
|
+
- lib/data_sampler/table_sample.rb
|
61
|
+
- lib/data_sampler/version.rb
|
62
|
+
homepage: https://github.com/crishoj/data_sampler
|
63
|
+
licenses: []
|
64
|
+
post_install_message:
|
65
|
+
rdoc_options: []
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ! '>='
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '0'
|
74
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ! '>='
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '0'
|
80
|
+
requirements: []
|
81
|
+
rubyforge_project: data_sampler
|
82
|
+
rubygems_version: 1.8.6
|
83
|
+
signing_key:
|
84
|
+
specification_version: 3
|
85
|
+
summary: Extract a sample of records from a database while maintaining referential
|
86
|
+
integrity.
|
87
|
+
test_files: []
|