data_sampler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in data_sampler.gemspec
gemspec

# These are here to coerce RubyMine into recognising these dependencies
gem "activerecord"
gem "schema_plus"
9
+
data/README ADDED
@@ -0,0 +1,47 @@
1
+
2
+ Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
3
+ put off by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
4
+ take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.
5
+
6
+ COMMANDS:
7
+
8
+ help Display global or [command] help documentation.
9
+ sample Extract a sample from the given connection
10
+
11
+ OPTIONS:
12
+
13
+ --adapter NAME
14
+ ActiveRecord adapter to use
15
+
16
+ --database NAME
17
+ Name of database to sample
18
+
19
+ --username USER
20
+ Username for connection
21
+
22
+ --password PASSWORD
23
+ Password for connection
24
+
25
+ --encoding ENCODING
26
+ Encoding for connection
27
+
28
+ --socket PATH
29
+ Socket for connection
30
+
31
+ --rows NUM
32
+ Number of rows to sample per table
33
+
34
+ --log PATH
35
+ Log queries to PATH
36
+
37
+ GLOBAL OPTIONS:
38
+
39
+ -h, --help
40
+ Display help documentation
41
+
42
+ -v, --version
43
+ Display version information
44
+
45
+ -t, --trace
46
+ Display backtrace when an error occurs
47
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler/gem_tasks'
2
+
data/bin/data_sampler ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+ require "data_sampler"
3
+ require "commander/import"
4
+ require "logger"
5
+
6
# Program metadata reported by the command-line framework.
program :version, DataSampler::VERSION
program :description, 'Extract a sample of desired size from a database while ensuring referential integrity.'
default_command :sample

# The `sample` command: connect using the given options and print the
# sampled rows as SQL on standard output.
command :sample do |c|
  c.description = 'Extract a sample from the given connection'

  # One entry per command-line switch: [switch, type, help text].
  [
    ['--adapter NAME', String, 'ActiveRecord adapter to use'],
    ['--database NAME', String, 'Name of database to sample'],
    ['--username USER', String, 'Username for connection'],
    ['--password PASSWORD', String, 'Password for connection'],
    ['--encoding ENCODING', String, 'Encoding for connection'],
    ['--socket PATH', String, 'Socket for connection'],
    ['--rows NUM', Integer, 'Number of rows to sample per table'],
    ['--log PATH', String, 'Log queries to PATH']
  ].each { |switch_spec| c.option(*switch_spec) }

  c.when_called do |args, options|
    # Values used for any switch that was not given on the command line.
    fallbacks = {
      :adapter => 'mysql',
      :database => 'test',
      :username => 'root',
      :encoding => 'utf8',
      :socket => '/opt/local/var/run/mysql5/mysqld.sock',
      :rows => 1000
    }
    options.default(fallbacks)

    # Query logging is only enabled when a log path was supplied.
    if options.log
      ActiveRecord::Base.logger = Logger.new(options.log)
    end

    pool = ActiveRecord::Base.establish_connection(options.__hash__)
    pool.with_connection do |conn|
      sampler = DataSampler::Sample.new(conn, options.rows)
      puts sampler.to_sql
    end
  end
end
34
+
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "data_sampler/version"
4
+
5
# Gem specification for data_sampler.
Gem::Specification.new do |s|
  s.name        = "data_sampler"
  s.version     = DataSampler::VERSION
  s.authors     = ["Christian Rishoj"]
  s.email       = ["christian@rishoj.net"]
  s.homepage    = "https://github.com/crishoj/data_sampler"
  s.summary     = %q{Extract a sample of records from a database while maintaining referential integrity.}
  s.description = %q{Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
put off by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.}

  s.rubyforge_project = "data_sampler"

  # The packaged file lists are taken from what git tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "schema_plus"
  s.add_dependency "activerecord"
  # bin/data_sampler requires "commander/import"; without this declaration a
  # clean install of the gem cannot run its own executable.
  s.add_dependency "commander"
end
@@ -0,0 +1,21 @@
1
module DataSampler
  # A row that must be present in +table_name+, identified by +keys+
  # (a hash of column name => value).
  #
  # Instances are value objects: two dependencies on the same table with the
  # same keys are equal and hash identically, so they collapse to a single
  # entry when stored in a Set or used as Hash keys.
  class Dependency

    attr_reader :table_name
    attr_reader :keys

    # table_name - name of the table the required row lives in
    # keys       - hash of column name => value identifying the row
    def initialize(table_name, keys)
      @table_name = table_name
      @keys = keys
    end

    # True when +other+ is a Dependency on the same table with the same keys.
    # Any other kind of object is simply unequal (no exception).
    def eql?(other)
      other.is_a?(Dependency) &&
        table_name == other.table_name &&
        keys == other.keys
    end
    alias_method :==, :eql?

    # Must agree with #eql?; hash-based collections (Set, Hash) consult #hash
    # first and only then #eql?, so without this override equal dependencies
    # would never be de-duplicated.
    def hash
      [table_name, keys].hash
    end

    # Human-readable description, e.g. for log output.
    def to_s
      "#{keys} in table #{table_name}"
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require "data_sampler/table_sample"
2
+
3
module DataSampler

  # A sample of a whole database: one TableSample per table reported by the
  # connection, extended until no table sample reports new dependencies.
  class Sample

    # connection     - database connection; must respond to #tables and is
    #                  handed on to every TableSample
    # rows_per_table - number of rows initially sampled from each table
    def initialize(connection, rows_per_table = 1000)
      @connection = connection
      @rows_per_table = rows_per_table
      @table_samples = {}
      @computed = false
    end

    # Samples every table, then repeatedly asks each table sample to resolve
    # its pending dependencies until a full pass discovers nothing new.
    # Progress is reported on standard error.
    def compute!
      @connection.tables.each do |table_name|
        # Workaround for inconsistent casing in table definitions (http://bugs.mysql.com/bug.php?id=60773)
        # Use the non-mutating form: the strings belong to the adapter and
        # may be frozen or shared.
        name = table_name.downcase
        @table_samples[name] = TableSample.new(@connection, name, @rows_per_table)
      end
      warn "Sampling #{@table_samples.count} tables..."
      @table_samples.values.each(&:sample!)
      warn "Ensuring referential integrity..."
      loop do
        # Count the table samples that reported new dependencies in this pass
        # (every table sample is visited; nothing is short-circuited).
        new_dependencies = @table_samples.values.count do |table_sample|
          table_sample.ensure_referential_integrity(@table_samples)
        end
        break if new_dependencies.zero?
        warn " - discovered #{new_dependencies} new dependencies"
      end
      warn " - referential integrity obtained"
      @computed = true
    end

    # Returns the SQL of all table samples joined by newlines, computing the
    # sample first if that has not happened yet.
    def to_sql
      compute! unless @computed
      @table_samples.values.collect(&:to_sql) * "\n"
    end

  end

end
@@ -0,0 +1,117 @@
1
require "set"
require "data_sampler/dependency"
2
+
3
module DataSampler

  # The sampled rows of a single table, plus the dependencies (rows in other
  # tables referenced through foreign keys) that those rows have introduced
  # and that have not been resolved yet.
  class TableSample

    attr_reader :table_name
    attr_reader :pending_dependencies

    # connection - database connection used for quoting, schema lookups and
    #              queries
    # table_name - name of the table to sample
    # size       - number of rows fetched by the initial sample
    def initialize(connection, table_name, size = 1000)
      @table_name = table_name
      @connection = connection
      @size = size
      @pending_dependencies = Set.new
      @sample = Set.new
      @sampled = false
      @sampled_ids = Set.new
    end

    # Fetches the initial sample on the first call and returns the set of
    # sampled rows. Later calls return the same set without querying again.
    def sample!
      fetch_sample(@size) unless @sampled
      @sample
    end

    # Makes sure the row described by +dependency+ is part of this sample,
    # fetching it from the database if necessary.
    #
    # Returns true when adding the row introduced new pending dependencies,
    # and false otherwise (already present, row not found, nothing new).
    def fulfil(dependency)
      return false if fulfilled?(dependency)
      where = dependency.keys.collect { |col, val| "#{@connection.quote_column_name col} = #{@connection.quote val}" } * ' AND '
      sql = "SELECT * FROM #{@connection.quote_table_name @table_name} WHERE " + where
      row = @connection.select_one(sql)
      # The referenced row may not exist; there is nothing to add then.
      return false if row.nil?
      add row
    end

    # True when the row described by +dependency+ is known to be sampled.
    def fulfilled?(dependency)
      # FIXME: Only checks id column
      keys = dependency.keys
      keys.size == 1 && keys.key?('id') && @sampled_ids.include?(keys['id'])
    end

    # Adds +row+ to the sample and records the dependencies it introduces.
    #
    # Returns true when at least one previously unseen dependency was
    # recorded; false when the row was already sampled, introduced nothing
    # new, or its foreign keys could not be inspected.
    def add(row)
      return false unless @sample.add? row
      @sampled_ids.add row['id'] if row['id']
      # count (rather than any?) so every dependency is recorded
      dependencies_for(row).count { |dep| @pending_dependencies.add?(dep) } > 0
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      false
    end

    # Resolves all currently pending dependencies by asking the table sample
    # of each referenced table to fulfil them.
    #
    # table_samples - hash of table name => TableSample
    #
    # Returns true when fulfilling any dependency introduced new ones.
    def ensure_referential_integrity(table_samples)
      any_new = false
      # Swap in a fresh set first: fulfilling may add new dependencies (also
      # to this table sample) which belong to the next pass.
      deps_in_progress = @pending_dependencies
      @pending_dependencies = Set.new
      deps_in_progress.each do |dependency|
        target = sample_for(table_samples, dependency.table_name)
        if target.nil?
          warn " - skipping #{dependency}: no sample for that table"
          next
        end
        any_new = true if target.fulfil(dependency)
      end
      any_new
    end

    # Renders the sample as a comment line with the row count followed by one
    # INSERT statement per row, joined by newlines.
    def to_sql
      ret = ["-- #{@table_name}: #{@sample.count} rows"]
      unless @sample.empty?
        quoted_cols = @sample.first.keys.collect { |col| @connection.quote_column_name col }
        sql = "INSERT INTO #{@connection.quote_table_name @table_name} (#{quoted_cols * ','})"
        @sample.each do |row|
          quoted_vals = row.values.collect { |val| @connection.quote val }
          ret << sql + " VALUES (#{quoted_vals * ','})"
        end
      end
      ret * "\n"
    end

    protected

    # Selects up to +count+ rows (highest primary key first when the table
    # has one) and adds them to the sample.
    def fetch_sample(count)
      # Mark as sampled up front so neither success nor a failing table is
      # queried again by later calls to #sample!.
      @sampled = true
      sql = "SELECT * FROM #{@connection.quote_table_name @table_name}"
      pk = @connection.primary_key(@table_name)
      sql += " ORDER BY #{@connection.quote_column_name pk} DESC" unless pk.nil?
      sql += " LIMIT #{count}"
      @connection.select_all(sql).each { |row| add(row) }
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      []
    end

    # True when this table is not listed among the connection's views.
    def samplable?
      # We shouldn't be sampling views
      @connection.views.grep(@table_name).empty?
    end

    # Looks up the table sample for +name+, falling back to the downcased
    # name in case the map is keyed by lower-case table names.
    def sample_for(table_samples, name)
      table_samples[name] || table_samples[name.to_s.downcase]
    end

    # Builds the Dependency that +row+ has through foreign key +fk+, or nil
    # when all referencing columns of the row are NULL.
    def dependency_for(fk, row)
      cols = fk.column_names
      raise "No column names in foreign key #{fk.inspect}" if cols.empty?
      ref = {}
      # Pair each referenced column with the referencing column at the same
      # position.
      fk.references_column_names.zip(cols) do |ref_col, col|
        ref[ref_col] = row[col] unless row[col].nil?
      end
      Dependency.new(fk.references_table_name, ref) unless ref.empty?
    end

    # All dependencies +row+ has through this table's foreign keys.
    def dependencies_for(row)
      foreign_keys.collect { |fk| dependency_for(fk, row) }.compact
    end

    # Foreign keys of this table, looked up once and cached.
    def foreign_keys
      @fks ||= @connection.foreign_keys(@table_name)
    end

  end
end
@@ -0,0 +1,3 @@
1
module DataSampler
  # Version of the data_sampler gem. Frozen so the constant cannot be
  # modified in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,5 @@
1
+ require "data_sampler/version"
2
+ require "data_sampler/sample"
3
+ require "active_record"
4
+ require 'schema_plus'
5
+
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_sampler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Christian Rishoj
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-03 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: schema_plus
16
+ requirement: &70132762292640 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70132762292640
25
+ - !ruby/object:Gem::Dependency
26
+ name: activerecord
27
+ requirement: &70132762292220 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70132762292220
36
+ description: ! 'Ever found yourself wanting a modest amount of fresh rows from a production
37
+ database for development purposes, but
38
+
39
+ put back by the need to maintain referential integrity in the extracted data sample?
40
+ This data sampler utility will
41
+
42
+ take care that referential dependencies are fulfilled by recursively fetching any
43
+ rows referred to by the sample.'
44
+ email:
45
+ - christian@rishoj.net
46
+ executables:
47
+ - data_sampler
48
+ extensions: []
49
+ extra_rdoc_files: []
50
+ files:
51
+ - .gitignore
52
+ - Gemfile
53
+ - README
54
+ - Rakefile
55
+ - bin/data_sampler
56
+ - data_sampler.gemspec
57
+ - lib/data_sampler.rb
58
+ - lib/data_sampler/dependency.rb
59
+ - lib/data_sampler/sample.rb
60
+ - lib/data_sampler/table_sample.rb
61
+ - lib/data_sampler/version.rb
62
+ homepage: https://github.com/crishoj/data_sampler
63
+ licenses: []
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ! '>='
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ! '>='
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project: data_sampler
82
+ rubygems_version: 1.8.6
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: Extract a sample of records from a database while maintaining referential
86
+ integrity.
87
+ test_files: []