data_sampler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +9 -0
- data/README +47 -0
- data/Rakefile +2 -0
- data/bin/data_sampler +34 -0
- data/data_sampler.gemspec +25 -0
- data/lib/data_sampler/dependency.rb +21 -0
- data/lib/data_sampler/sample.rb +41 -0
- data/lib/data_sampler/table_sample.rb +117 -0
- data/lib/data_sampler/version.rb +3 -0
- data/lib/data_sampler.rb +5 -0
- metadata +87 -0
data/Gemfile
ADDED
data/README
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
Ever found yourself wanting a modest number of fresh rows from a production database for development purposes, but
|
3
|
+
put off by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
|
4
|
+
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.
|
5
|
+
|
6
|
+
COMMANDS:
|
7
|
+
|
8
|
+
help Display global or [command] help documentation.
|
9
|
+
sample Extract a sample from the given connection
|
10
|
+
|
11
|
+
OPTIONS:
|
12
|
+
|
13
|
+
--adapter NAME
|
14
|
+
ActiveRecord adapter to use
|
15
|
+
|
16
|
+
--database NAME
|
17
|
+
Name of database to sample
|
18
|
+
|
19
|
+
--username USER
|
20
|
+
Username for connection
|
21
|
+
|
22
|
+
--password PASSWORD
|
23
|
+
Password for connection
|
24
|
+
|
25
|
+
--encoding ENCODING
|
26
|
+
Encoding for connection
|
27
|
+
|
28
|
+
--socket PATH
|
29
|
+
Socket for connection
|
30
|
+
|
31
|
+
--rows NUM
|
32
|
+
Number of rows to sample per table
|
33
|
+
|
34
|
+
--log PATH
|
35
|
+
Log queries to PATH
|
36
|
+
|
37
|
+
GLOBAL OPTIONS:
|
38
|
+
|
39
|
+
-h, --help
|
40
|
+
Display help documentation
|
41
|
+
|
42
|
+
-v, --version
|
43
|
+
Display version information
|
44
|
+
|
45
|
+
-t, --trace
|
46
|
+
Display backtrace when an error occurs
|
47
|
+
|
data/Rakefile
ADDED
data/bin/data_sampler
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for DataSampler, declared with the commander DSL.
require "data_sampler"
require "commander/import"
require "logger"

program :version, DataSampler::VERSION
program :description, 'Extract a sample of desired size from a database while ensuring referential integrity.'
default_command :sample

command :sample do |cmd|
  cmd.description = 'Extract a sample from the given connection'

  # Each entry is [switch, type, help text]; they are registered in this order.
  [
    ['--adapter NAME',      String,  'ActiveRecord adapter to use'],
    ['--database NAME',     String,  'Name of database to sample'],
    ['--username USER',     String,  'Username for connection'],
    ['--password PASSWORD', String,  'Password for connection'],
    ['--encoding ENCODING', String,  'Encoding for connection'],
    ['--socket PATH',       String,  'Socket for connection'],
    ['--rows NUM',          Integer, 'Number of rows to sample per table'],
    ['--log PATH',          String,  'Log queries to PATH']
  ].each do |switch, type, help|
    cmd.option switch, type, help
  end

  cmd.when_called do |_args, options|
    # Values applied to any option that was not given on the command line.
    fallbacks = {
      :adapter  => 'mysql',
      :database => 'test',
      :username => 'root',
      :encoding => 'utf8',
      :socket   => '/opt/local/var/run/mysql5/mysqld.sock',
      :rows     => 1000
    }
    options.default fallbacks

    # Query logging is opt-in via --log.
    if options.log
      ActiveRecord::Base.logger = Logger.new(options.log)
    end

    # The full option hash (including :rows and :log) is passed as the
    # connection specification, exactly as given.
    established = ActiveRecord::Base.establish_connection(options.__hash__)
    established.with_connection do |conn|
      sampler = DataSampler::Sample.new(conn, options.rows)
      # The generated SQL goes to standard output.
      puts sampler.to_sql
    end
  end
end
|
34
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "data_sampler/version"

# Gem specification for data_sampler: package metadata, the file lists
# (taken from git) and the runtime dependencies.
Gem::Specification.new do |s|
  s.name        = "data_sampler"
  s.version     = DataSampler::VERSION
  s.authors     = ["Christian Rishoj"]
  s.email       = ["christian@rishoj.net"]
  s.homepage    = "https://github.com/crishoj/data_sampler"
  s.summary     = %q{Extract a sample of records from a database while maintaining referential integrity.}
  s.description = %q{Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.}

  s.rubyforge_project = "data_sampler"

  # File lists come from the git index, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "schema_plus"
  s.add_dependency "activerecord"
  # The data_sampler executable does `require "commander/import"`; without
  # this declaration a plain `gem install data_sampler` can leave the
  # executable failing with a LoadError.
  s.add_dependency "commander"
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DataSampler
  # A reference from a sampled row to a row in another table: the name of the
  # referenced table plus the column/value pairs identifying the referenced row.
  #
  # Instances are value objects: two dependencies with the same table name and
  # keys are interchangeable, which lets them be de-duplicated in a Set or
  # used as Hash keys.
  class Dependency

    # Name of the referenced table.
    attr_reader :table_name
    # Column => value pairs identifying the referenced row.
    attr_reader :keys

    def initialize(table_name, keys)
      @table_name = table_name
      @keys = keys
    end

    # Value equality on table name and keys. Objects that do not expose both
    # readers are never equal (instead of raising NoMethodError).
    def eql?(other)
      return false unless other.respond_to?(:table_name) && other.respond_to?(:keys)
      table_name == other.table_name && keys == other.keys
    end
    alias_method :==, :eql?

    # Must agree with #eql?: Set and Hash compare #hash first, so without this
    # override equal dependencies would never be recognised as duplicates.
    def hash
      [table_name, keys].hash
    end

    def to_s
      "#{keys} in table #{table_name}"
    end

  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "data_sampler/table_sample"
|
2
|
+
|
3
|
+
module DataSampler

  # Builds one TableSample per table of a connection, samples each of them and
  # then repeatedly asks every table sample to fulfil its pending dependencies
  # until a full pass discovers nothing new.
  class Sample

    # connection::     object responding to +tables+; it is also handed to
    #                  every TableSample.
    # rows_per_table:: number of rows requested from each table.
    def initialize(connection, rows_per_table = 1000)
      @connection = connection
      @rows_per_table = rows_per_table
      @table_samples = {}
      @computed = false
    end

    # Samples all tables and resolves dependencies between the samples.
    # Progress messages are written with +warn+.
    def compute!
      @connection.tables.each do |table_name|
        # Workaround for inconsistent casing in table definitions (http://bugs.mysql.com/bug.php?id=60773)
        # Non-mutating downcase: the strings belong to the connection and may
        # be frozen or shared, so they must not be modified in place.
        name = table_name.downcase
        @table_samples[name] = TableSample.new(@connection, name, @rows_per_table)
      end
      warn "Sampling #{@table_samples.count} tables..."
      @table_samples.values.each(&:sample!)
      warn "Ensuring referential integrity..."
      loop do
        # Every table sample is visited on each pass (count does not
        # short-circuit); the result is the number of tables reporting
        # newly discovered dependencies.
        new_dependencies = @table_samples.values.count do |table_sample|
          table_sample.ensure_referential_integrity(@table_samples)
        end
        break if new_dependencies.zero?
        warn " - discovered #{new_dependencies} new dependencies"
      end
      warn " - referential integrity obtained"
      @computed = true
    end

    # Returns the SQL of all table samples joined by newlines, computing the
    # sample first if that has not happened yet.
    def to_sql
      compute! unless @computed
      @table_samples.values.map(&:to_sql).join("\n")
    end

  end

end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require "data_sampler/dependency"
|
2
|
+
|
3
|
+
require "set"

module DataSampler

  # The sampled rows of a single table, together with the foreign-key
  # dependencies of those rows that still have to be fulfilled by rows of
  # other tables.
  class TableSample

    attr_reader :table_name
    # Set of Dependency objects registered by #add since the last call to
    # #ensure_referential_integrity.
    attr_reader :pending_dependencies

    # connection:: object providing the quoting, select and schema methods
    #              used below (quote, quote_column_name, quote_table_name,
    #              select_all, select_one, primary_key, foreign_keys).
    # table_name:: table to sample.
    # size::       number of rows requested by #sample!.
    def initialize(connection, table_name, size = 1000)
      @table_name = table_name
      @connection = connection
      @size = size
      @pending_dependencies = Set.new
      @sample = Set.new
      @sampled = false
      @sampled_ids = Set.new
    end

    # Fetches the initial sample on the first call and returns the Set of
    # sampled rows.
    def sample!
      unless @sampled
        fetch_sample(@size)
        # Record that the table has been queried so later calls do not
        # repeat the SELECT.
        @sampled = true
      end
      @sample
    end

    # Fetches the row a dependency points at and adds it to the sample.
    # Returns true when adding the row registered new pending dependencies,
    # a false value otherwise (already fulfilled, row not found, nothing new).
    def fulfil(dependency)
      return false if fulfilled?(dependency)
      conditions = dependency.keys.map do |col, val|
        "#{@connection.quote_column_name col} = #{@connection.quote val}"
      end
      sql = "SELECT * FROM #{@connection.quote_table_name @table_name} WHERE " + conditions.join(' AND ')
      row = @connection.select_one(sql)
      # A dangling reference yields no row; there is nothing to add then.
      return false if row.nil?
      add(row)
    end

    # True when the dependency is a plain lookup by 'id' of a row that has
    # already been sampled.
    def fulfilled?(dependency)
      # FIXME: Only checks id column
      keys = dependency.keys
      keys.size == 1 && keys.key?('id') && @sampled_ids.include?(keys['id'])
    end

    # Adds a row to the sample and registers its dependencies as pending.
    # Returns true if at least one dependency was not pending before, false
    # if the row was already sampled, nothing new was registered, or the
    # foreign keys could not be read.
    def add(row)
      return false unless @sample.add?(row)
      @sampled_ids.add(row['id']) if row['id']
      # select (not any?) so that every dependency is registered.
      newly_pending = dependencies_for(row).select { |dep| @pending_dependencies.add?(dep) }
      !newly_pending.empty?
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      false
    end

    # Hands every pending dependency to the sample of the table it refers to.
    # table_samples maps table names to TableSample objects. Returns true if
    # fulfilling any of them registered new dependencies.
    def ensure_referential_integrity(table_samples)
      deps_in_progress = @pending_dependencies
      # Start a fresh set: fulfilling may register new dependencies on this
      # very table while the old set is being iterated.
      @pending_dependencies = Set.new
      any_new = false
      deps_in_progress.each do |dependency|
        target = table_sample_for(table_samples, dependency.table_name)
        if target.nil?
          warn " - no sample for table #{dependency.table_name}, skipping #{dependency}"
          next
        end
        any_new = true if target.fulfil(dependency)
      end
      any_new
    end

    # Renders the sample as a comment line followed by one INSERT statement
    # per row. Each statement ends with ';' so the output can be run as a
    # script.
    def to_sql
      lines = ["-- #{@table_name}: #{@sample.count} rows"]
      unless @sample.empty?
        quoted_cols = @sample.first.keys.map { |col| @connection.quote_column_name col }
        insert = "INSERT INTO #{@connection.quote_table_name @table_name} (#{quoted_cols * ','})"
        @sample.each do |row|
          quoted_vals = row.values.map { |val| @connection.quote val }
          lines << "#{insert} VALUES (#{quoted_vals * ','});"
        end
      end
      lines * "\n"
    end

    protected

    # Selects up to +count+ rows, ordered by primary key descending when the
    # table has one, and adds each to the sample.
    def fetch_sample(count)
      sql = "SELECT * FROM #{@connection.quote_table_name @table_name}"
      pk = @connection.primary_key(@table_name)
      sql += " ORDER BY #{@connection.quote_column_name pk} DESC" unless pk.nil?
      sql += " LIMIT #{count}"
      @connection.select_all(sql).each { |row| add(row) }
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      []
    end

    def samplable?
      # We shouldn't be sampling views
      @connection.views.grep(@table_name).empty?
    end

    # Looks a table sample up by name, falling back to the downcased name.
    # NOTE(review): assumes the caller keys table_samples by downcased table
    # names while foreign keys may report mixed case - confirm.
    def table_sample_for(table_samples, name)
      table_samples[name] || table_samples[name.to_s.downcase]
    end

    # Builds the Dependency a row has through one foreign key, pairing the
    # referenced columns with the row's values of the referencing columns.
    # Returns nil when all referencing values are nil.
    def dependency_for(fk, row)
      cols = fk.column_names
      raise "No column names in foreign key #{fk.inspect}" if cols.empty?
      ref = {}
      fk.references_column_names.each_with_index do |ref_col, i|
        val = row[cols[i]]
        ref[ref_col] = val unless val.nil?
      end
      Dependency.new(fk.references_table_name, ref) unless ref.empty?
    end

    # All dependencies of a row, one per foreign key with non-nil values.
    def dependencies_for(row)
      foreign_keys.map { |fk| dependency_for(fk, row) }.compact
    end

    # Foreign keys of the table, read from the connection once and cached.
    def foreign_keys
      @fks ||= @connection.foreign_keys(@table_name)
    end

  end
end
|
data/lib/data_sampler.rb
ADDED
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: data_sampler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Christian Rishoj
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-08-03 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: schema_plus
|
16
|
+
requirement: &70132762292640 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70132762292640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activerecord
|
27
|
+
requirement: &70132762292220 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70132762292220
|
36
|
+
description: ! 'Ever found yourself wanting a modest amount of fresh rows from a production
|
37
|
+
database for development purposes, but
|
38
|
+
|
39
|
+
put back by the need to maintain referential integrity in the extracted data sample?
|
40
|
+
This data sampler utility will
|
41
|
+
|
42
|
+
take care that referential dependencies are fulfilled by recursively fetching any
|
43
|
+
rows referred to by the sample.'
|
44
|
+
email:
|
45
|
+
- christian@rishoj.net
|
46
|
+
executables:
|
47
|
+
- data_sampler
|
48
|
+
extensions: []
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- .gitignore
|
52
|
+
- Gemfile
|
53
|
+
- README
|
54
|
+
- Rakefile
|
55
|
+
- bin/data_sampler
|
56
|
+
- data_sampler.gemspec
|
57
|
+
- lib/data_sampler.rb
|
58
|
+
- lib/data_sampler/dependency.rb
|
59
|
+
- lib/data_sampler/sample.rb
|
60
|
+
- lib/data_sampler/table_sample.rb
|
61
|
+
- lib/data_sampler/version.rb
|
62
|
+
homepage: https://github.com/crishoj/data_sampler
|
63
|
+
licenses: []
|
64
|
+
post_install_message:
|
65
|
+
rdoc_options: []
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ! '>='
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '0'
|
74
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ! '>='
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '0'
|
80
|
+
requirements: []
|
81
|
+
rubyforge_project: data_sampler
|
82
|
+
rubygems_version: 1.8.6
|
83
|
+
signing_key:
|
84
|
+
specification_version: 3
|
85
|
+
summary: Extract a sample of records from a database while maintaining referential
|
86
|
+
integrity.
|
87
|
+
test_files: []
|