data_sampler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
# Fetch gems over TLS; the plain-HTTP endpoint offers no protection against
# tampering with downloaded packages.
source "https://rubygems.org"

# Runtime dependencies are declared in data_sampler.gemspec.
gemspec

# These are here to coerce RubyMine into recognising these dependencies
gem "activerecord"
gem "schema_plus"
9
+
data/README ADDED
@@ -0,0 +1,47 @@
1
+
2
+ Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
3
+ put off by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
4
+ take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.
5
+
6
+ COMMANDS:
7
+
8
+ help Display global or [command] help documentation.
9
+ sample Extract a sample from the given connection
10
+
11
+ OPTIONS:
12
+
13
+ --adapter NAME
14
+ ActiveRecord adapter to use
15
+
16
+ --database NAME
17
+ Name of database to sample
18
+
19
+ --username USER
20
+ Username for connection
21
+
22
+ --password PASSWORD
23
+ Password for connection
24
+
25
+ --encoding ENCODING
26
+ Encoding for connection
27
+
28
+ --socket PATH
29
+ Socket for connection
30
+
31
+ --rows NUM
32
+ Number of rows to sample per table
33
+
34
+ --log PATH
35
+ Log queries to PATH
36
+
37
+ GLOBAL OPTIONS:
38
+
39
+ -h, --help
40
+ Display help documentation
41
+
42
+ -v, --version
43
+ Display version information
44
+
45
+ -t, --trace
46
+ Display backtrace when an error occurs
47
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler/gem_tasks'
2
+
data/bin/data_sampler ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+ require "data_sampler"
3
+ require "commander/import"
4
+ require "logger"
5
+
6
# Command-line definition (commander DSL): a single `sample` command, which is
# also the default command when none is named.
program :version, DataSampler::VERSION
program :description, 'Extract a sample of desired size from a database while ensuring referential integrity.'
default_command :sample

command :sample do |cmd|
  cmd.description = 'Extract a sample from the given connection'

  # Each entry is [switch, type, help text]; registered in this order.
  [
    ['--adapter NAME', String, 'ActiveRecord adapter to use'],
    ['--database NAME', String, 'Name of database to sample'],
    ['--username USER', String, 'Username for connection'],
    ['--password PASSWORD', String, 'Password for connection'],
    ['--encoding ENCODING', String, 'Encoding for connection'],
    ['--socket PATH', String, 'Socket for connection'],
    ['--rows NUM', Integer, 'Number of rows to sample per table'],
    ['--log PATH', String, 'Log queries to PATH']
  ].each { |switch, type, help| cmd.option switch, type, help }

  cmd.when_called do |_args, options|
    # Values used for any option not given on the command line.
    fallbacks = {
      :adapter => 'mysql',
      :database => 'test',
      :username => 'root',
      :encoding => 'utf8',
      :socket => '/opt/local/var/run/mysql5/mysqld.sock',
      :rows => 1000
    }
    options.default(fallbacks)

    # Query logging is only enabled when --log was supplied.
    if options.log
      ActiveRecord::Base.logger = Logger.new(options.log)
    end

    pool = ActiveRecord::Base.establish_connection(options.__hash__)
    pool.with_connection do |conn|
      sampler = DataSampler::Sample.new(conn, options.rows)
      # The generated SQL goes to standard output.
      puts sampler.to_sql
    end
  end
end
34
+
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "data_sampler/version"
4
+
5
# Gem specification for data_sampler. File lists are derived from the files
# tracked by git, so building the gem requires a git checkout.
Gem::Specification.new do |spec|
  spec.name        = "data_sampler"
  spec.version     = DataSampler::VERSION
  spec.authors     = ["Christian Rishoj"]
  spec.email       = ["christian@rishoj.net"]
  spec.homepage    = "https://github.com/crishoj/data_sampler"
  spec.summary     = %q{Extract a sample of records from a database while maintaining referential integrity.}
  spec.description = %q{Ever found yourself wanting a modest amount of fresh rows from a production database for development purposes, but
put back by the need to maintain referential integrity in the extracted data sample? This data sampler utility will
take care that referential dependencies are fulfilled by recursively fetching any rows referred to by the sample.}

  spec.rubyforge_project = "data_sampler"

  tracked_files  = `git ls-files`.split("\n")
  tracked_tests  = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_binaries = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  # Executables are listed by bare file name, without the bin/ prefix.
  spec.executables   = tracked_binaries.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies, without version constraints.
  ["schema_plus", "activerecord"].each do |gem_name|
    spec.add_dependency gem_name
  end
end
@@ -0,0 +1,21 @@
1
module DataSampler
  # A requirement that a particular row of another table be present in the
  # sample: the referenced table name plus the column values identifying the row.
  #
  # Instances are value objects: two dependencies on the same table with the
  # same keys are interchangeable. `hash` is defined consistently with `eql?`
  # so that hash-based collections (Set, Hash keys) treat them as duplicates.
  class Dependency

    attr_reader :table_name
    attr_reader :keys

    # @param table_name [String] name of the referenced table
    # @param keys [Hash] referenced column name => value the row must have
    def initialize(table_name, keys)
      @table_name = table_name
      @keys = keys
    end

    # Value equality; any object that is not a Dependency is simply unequal.
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      other.is_a?(Dependency) && table_name == other.table_name && keys == other.keys
    end

    # Strict equality used by Set and Hash lookups; must agree with #hash.
    #
    # @param other [Object]
    # @return [Boolean]
    def eql?(other)
      other.is_a?(Dependency) && table_name.eql?(other.table_name) && keys.eql?(other.keys)
    end

    # @return [Integer] hash code derived from the same fields #eql? compares
    def hash
      [table_name, keys].hash
    end

    # @return [String] description of the form "<keys> in table <table_name>"
    def to_s
      "#{keys} in table #{table_name}"
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require "data_sampler/table_sample"
2
+
3
module DataSampler

  # A sample spanning every table reported by a connection: each table is
  # sampled individually, then rows referenced by the sampled rows are pulled
  # in repeatedly until no table reports new dependencies.
  class Sample

    # @param connection [Object] database connection; this class calls only
    #   `tables` on it and hands it on to TableSample
    # @param rows_per_table [Integer] initial number of rows to sample per table
    def initialize(connection, rows_per_table = 1000)
      @connection = connection
      @rows_per_table = rows_per_table
      @table_samples = {}
      @computed = false
    end

    # Builds the per-table samples and resolves their dependencies.
    # Progress messages are written with `warn`.
    #
    # @return [true]
    def compute!
      @connection.tables.each do |table_name|
        # Workaround for inconsistent casing in table definitions (http://bugs.mysql.com/bug.php?id=60773)
        # A downcased copy is used so the connection's own (possibly frozen)
        # strings are never modified.
        name = table_name.downcase
        @table_samples[name] = TableSample.new(@connection, name, @rows_per_table)
      end
      warn "Sampling #{@table_samples.count} tables..."
      @table_samples.each_value(&:sample!)
      warn "Ensuring referential integrity..."
      loop do
        # Every table is visited in each pass (no short-circuit); the count is
        # the number of tables whose pass turned up new dependencies.
        new_dependencies = @table_samples.values.count do |table_sample|
          table_sample.ensure_referential_integrity(@table_samples)
        end
        break if new_dependencies == 0
        warn " - discovered #{new_dependencies} new dependencies"
      end
      warn " - referential integrity obtained"
      @computed = true
    end

    # Computes the sample on first use and renders it.
    #
    # @return [String] the SQL of all table samples, joined by newlines
    def to_sql
      compute! unless @computed
      @table_samples.values.map(&:to_sql).join("\n")
    end

  end

end
@@ -0,0 +1,117 @@
1
+ require "data_sampler/dependency"
2
+
3
require "set"

module DataSampler

  # The sampled rows of one table, together with the dependencies (rows in
  # other tables referenced through foreign keys) that the sampled rows have
  # introduced and that have not been resolved yet.
  #
  # The connection is expected to respond to `quote`, `quote_column_name`,
  # `quote_table_name`, `select_all`, `select_one`, `primary_key`,
  # `foreign_keys` and `views` (presumably an ActiveRecord adapter extended by
  # schema_plus - confirm against the gem's requires).
  class TableSample

    attr_reader :table_name
    attr_reader :pending_dependencies

    # @param connection [Object] database connection (see class comment)
    # @param table_name [String] table to sample
    # @param size [Integer] number of rows fetched by the initial sample
    def initialize(connection, table_name, size = 1000)
      @table_name = table_name
      @connection = connection
      @size = size
      @pending_dependencies = Set.new
      @sample = Set.new
      @sampled = false
      @sampled_ids = Set.new
    end

    # Fetches the initial sample on the first call only; later calls return
    # the rows collected so far without querying again.
    #
    # @return [Set] the sampled rows
    def sample!
      unless @sampled
        fetch_sample(@size)
        @sampled = true
      end
      @sample
    end

    # Makes sure the row described by +dependency+ is part of this sample,
    # fetching it when necessary.
    #
    # @param dependency [Dependency] keys identifying a row of this table
    # @return [Boolean] true when a row was added and it introduced new
    #   pending dependencies; false otherwise (already present, row not found,
    #   or nothing new)
    def fulfil(dependency)
      return false if fulfilled?(dependency)
      conditions = dependency.keys.map do |col, val|
        "#{@connection.quote_column_name col} = #{@connection.quote val}"
      end
      sql = "SELECT * FROM #{@connection.quote_table_name @table_name} WHERE " + conditions.join(' AND ')
      row = @connection.select_one(sql)
      # A dangling reference yields no row; there is nothing to add then.
      return false if row.nil?
      add(row)
    end

    # @param dependency [Dependency]
    # @return [Boolean] whether the referenced row is known to be sampled
    def fulfilled?(dependency)
      # FIXME: Only checks id column
      keys = dependency.keys
      keys.size == 1 && keys.key?('id') && @sampled_ids.include?(keys['id'])
    end

    # Adds a row to the sample and records the dependencies it introduces.
    #
    # @param row [Hash] column name => value
    # @return [Boolean] true when the row was new and added at least one
    #   previously unknown pending dependency
    def add(row)
      return false unless @sample.add?(row)
      @sampled_ids.add(row['id']) if row['id']
      # count (rather than any?) so that every dependency is registered.
      dependencies_for(row).count { |dep| @pending_dependencies.add?(dep) } > 0
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      false
    end

    # Resolves all currently pending dependencies against the other samples.
    # Dependencies discovered while doing so are collected for the next pass.
    #
    # @param table_samples [Hash{String => TableSample}] samples by table name
    # @return [Boolean] true when any fulfilled dependency introduced new ones
    def ensure_referential_integrity(table_samples)
      deps_in_progress = @pending_dependencies
      @pending_dependencies = Set.new
      any_new = false
      deps_in_progress.each do |dependency|
        # Fall back to the downcased name in case the map is keyed that way
        # while the foreign key reports the table in its original casing.
        target = table_samples[dependency.table_name] ||
                 table_samples[dependency.table_name.to_s.downcase]
        if target.nil?
          warn " - skipping #{dependency}: no sample for that table"
          next
        end
        any_new = true if target.fulfil(dependency)
      end
      any_new
    end

    # @return [String] a comment line with the row count followed by one
    #   INSERT statement per sampled row, joined by newlines
    def to_sql
      ret = ["-- #{@table_name}: #{@sample.count} rows"]
      unless @sample.empty?
        # Column list is taken from the first row.
        quoted_cols = @sample.first.keys.map { |col| @connection.quote_column_name col }
        sql = "INSERT INTO #{@connection.quote_table_name @table_name} (#{quoted_cols.join(',')})"
        @sample.each do |row|
          quoted_vals = row.values.map { |val| @connection.quote val }
          ret << sql + " VALUES (#{quoted_vals.join(',')})"
        end
      end
      ret.join("\n")
    end

    protected

    # Queries up to +count+ rows, ordered by primary key descending when the
    # table has one, and adds each to the sample.
    def fetch_sample(count)
      sql = "SELECT * FROM #{@connection.quote_table_name @table_name}"
      pk = @connection.primary_key(@table_name)
      sql += " ORDER BY #{@connection.quote_column_name pk} DESC" unless pk.nil?
      sql += " LIMIT #{count}"
      @connection.select_all(sql).each { |row| add(row) }
    rescue ActiveRecord::StatementInvalid
      # Don't choke on unknown table engines, such as Sphinx
      []
    end

    # NOTE(review): not called anywhere in this class - views are currently
    # sampled like tables; confirm whether sample! was meant to consult this.
    def samplable?
      # We shouldn't be sampling views
      @connection.views.grep(@table_name).empty?
    end

    # Builds the dependency a row has through one foreign key, pairing each
    # referenced column with the row's value in the corresponding local column.
    # Columns whose value is nil are left out; returns nil when none remain.
    def dependency_for(fk, row)
      local_cols = fk.column_names
      raise "No column names in foreign key #{fk.inspect}" if local_cols.empty?
      ref = {}
      fk.references_column_names.zip(local_cols).each do |ref_col, col|
        value = row[col]
        ref[ref_col] = value unless value.nil?
      end
      Dependency.new(fk.references_table_name, ref) unless ref.empty?
    end

    # @return [Array<Dependency>] one dependency per foreign key with values
    def dependencies_for(row)
      foreign_keys.map { |fk| dependency_for(fk, row) }.compact
    end

    # Foreign keys of this table, looked up once and cached.
    def foreign_keys
      @fks ||= @connection.foreign_keys(@table_name)
    end

  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the data_sampler gem.
module DataSampler
  # Version string of this release.
  VERSION = '0.0.1'
end
@@ -0,0 +1,5 @@
1
+ require "data_sampler/version"
2
+ require "data_sampler/sample"
3
+ require "active_record"
4
+ require 'schema_plus'
5
+
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_sampler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Christian Rishoj
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-03 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: schema_plus
16
+ requirement: &70132762292640 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70132762292640
25
+ - !ruby/object:Gem::Dependency
26
+ name: activerecord
27
+ requirement: &70132762292220 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70132762292220
36
+ description: ! 'Ever found yourself wanting a modest amount of fresh rows from a production
37
+ database for development purposes, but
38
+
39
+ put back by the need to maintain referential integrity in the extracted data sample?
40
+ This data sampler utility will
41
+
42
+ take care that referential dependencies are fulfilled by recursively fetching any
43
+ rows referred to by the sample.'
44
+ email:
45
+ - christian@rishoj.net
46
+ executables:
47
+ - data_sampler
48
+ extensions: []
49
+ extra_rdoc_files: []
50
+ files:
51
+ - .gitignore
52
+ - Gemfile
53
+ - README
54
+ - Rakefile
55
+ - bin/data_sampler
56
+ - data_sampler.gemspec
57
+ - lib/data_sampler.rb
58
+ - lib/data_sampler/dependency.rb
59
+ - lib/data_sampler/sample.rb
60
+ - lib/data_sampler/table_sample.rb
61
+ - lib/data_sampler/version.rb
62
+ homepage: https://github.com/crishoj/data_sampler
63
+ licenses: []
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ! '>='
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ! '>='
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project: data_sampler
82
+ rubygems_version: 1.8.6
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: Extract a sample of records from a database while maintaining referential
86
+ integrity.
87
+ test_files: []