distyll 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b258d9e95565fd009d54bc4a70893f7ec7644b2e
4
+ data.tar.gz: 06714e3d3c272d347c331e27bf9bfbf6d78f1df4
5
+ SHA512:
6
+ metadata.gz: aa597d98f6dc4ae3848ae092efc3a2e2f3f511be6a05468f89527e74004f53fccd45ceb8b8b25ea053b846b4c7f1fa5149576e20f54aed61bdc3817aaf4d9a28
7
+ data.tar.gz: 92a4ef0ea98c1ccdbb3b38564e8438aa0b5502e6c3ecdad89046dc3eb038bb4a33a7b1947d388a61d7b10327bc555b0e4312eec2fdaabdc69e7435102f2724bb
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,14 @@
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 2.0.1"
  gem "simplecov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,81 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (4.1.1)
5
+ i18n (~> 0.6, >= 0.6.9)
6
+ json (~> 1.7, >= 1.7.7)
7
+ minitest (~> 5.1)
8
+ thread_safe (~> 0.1)
9
+ tzinfo (~> 1.1)
10
+ addressable (2.3.6)
11
+ builder (3.2.2)
12
+ descendants_tracker (0.0.4)
13
+ thread_safe (~> 0.3, >= 0.3.1)
14
+ docile (1.1.3)
15
+ faraday (0.9.0)
16
+ multipart-post (>= 1.2, < 3)
17
+ git (1.2.6)
18
+ github_api (0.11.3)
19
+ addressable (~> 2.3)
20
+ descendants_tracker (~> 0.0.1)
21
+ faraday (~> 0.8, < 0.10)
22
+ hashie (>= 1.2)
23
+ multi_json (>= 1.7.5, < 2.0)
24
+ nokogiri (~> 1.6.0)
25
+ oauth2
26
+ hashie (2.1.1)
27
+ highline (1.6.21)
28
+ i18n (0.6.9)
29
+ jeweler (2.0.1)
30
+ builder
31
+ bundler (>= 1.0)
32
+ git (>= 1.2.5)
33
+ github_api
34
+ highline (>= 1.6.15)
35
+ nokogiri (>= 1.5.10)
36
+ rake
37
+ rdoc
38
+ json (1.8.1)
39
+ jwt (0.1.13)
40
+ multi_json (>= 1.5)
41
+ mini_portile (0.6.0)
42
+ minitest (5.3.4)
43
+ multi_json (1.10.0)
44
+ multi_xml (0.5.5)
45
+ multipart-post (2.0.0)
46
+ nokogiri (1.6.2.1)
47
+ mini_portile (= 0.6.0)
48
+ oauth2 (0.9.3)
49
+ faraday (>= 0.8, < 0.10)
50
+ jwt (~> 0.1.8)
51
+ multi_json (~> 1.3)
52
+ multi_xml (~> 0.5)
53
+ rack (~> 1.2)
54
+ rack (1.5.2)
55
+ rake (10.3.2)
56
+ rdoc (3.12.2)
57
+ json (~> 1.4)
58
+ shoulda (3.5.0)
59
+ shoulda-context (~> 1.0, >= 1.0.1)
60
+ shoulda-matchers (>= 1.4.1, < 3.0)
61
+ shoulda-context (1.2.1)
62
+ shoulda-matchers (2.6.1)
63
+ activesupport (>= 3.0.0)
64
+ simplecov (0.8.2)
65
+ docile (~> 1.1.0)
66
+ multi_json
67
+ simplecov-html (~> 0.8.0)
68
+ simplecov-html (0.8.0)
69
+ thread_safe (0.3.3)
70
+ tzinfo (1.1.0)
71
+ thread_safe (~> 0.1)
72
+
73
+ PLATFORMS
74
+ ruby
75
+
76
+ DEPENDENCIES
77
+ bundler (~> 1.0)
78
+ jeweler (~> 2.0.1)
79
+ rdoc (~> 3.12)
80
+ shoulda
81
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2014 Mason F. Matthews
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,55 @@
1
+ = distyll
2
+
3
+ Suppose that you're writing code for a project that's been in production for a long time. All said and done,
4
+ the production database size is on the order of 100 GB. When you've finished writing your new feature, you
5
+ test it on your seeds. All is well. However, how do you know that it will work on production data? How do
6
+ you know that your seeds accurately reflect the variance (and oddities) present in the production data set?
7
+
8
+ Distyll attempts to solve this by creating a "recent" subset of the production data set. This could be done
9
+ naively by taking all of the records across the whole database with a created_at above a certain time.
10
+ However, a record created today may have an associated record (via a foreign key) which was created five
11
+ years ago. If you slice the entire database by created_at timestamps, you'll have foreign keys which point
12
+ nowhere. Not very helpful for ensuring that your new feature works on production data.
13
+
14
+ Distyll's solution is to start from a set of "core" ActiveRecord models supplied at initialization time
15
+ (plus a date threshold for these models), and only pull those that have been created since the date
16
+ threshold. It then traverses all belongs_to relationships from those core models and pulls in all of those
17
+ related records.
18
+
19
+ Consequently, you end up with a data set that is representative of production, is internally consistent,
20
+ and is smaller.
21
+
22
+ == Using distyll in your project
23
+
24
+ 1. Add <code>gem 'distyll'</code> to your Gemfile
25
+ 1. Run <code>bundle install</code>
26
+ 1. Add a <code>distyll:</code> database to your database.yml
27
+ 1. Run <code>rake db:create RAILS_ENV=distyll</code>
28
+ 1. Run <code>rake db:schema:load RAILS_ENV=distyll</code>
29
+ 1. Run <code>rails console</code>
30
+ 1. Call <code>Distyll.new(model_names, created_since).run</code>, passing the constructor an array of the core model names (as strings) and a date; core records created on or after that date will be copied.
31
+
32
+ If you need to clear out the distyll database and try again with different parameters, just go back to the <code>schema:load</code> step and continue from there.
33
+
34
+ == Contributing to distyll
35
+
36
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
37
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
38
+ * Fork the project.
39
+ * Start a feature/bugfix branch.
40
+ * Commit and push until you are happy with your contribution.
41
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
42
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or a change there is otherwise necessary, that is fine, but please isolate it in its own commit so I can cherry-pick around it.
43
+
44
+ == Next Steps for distyll
45
+
46
+ * Distyll only traverses belongs_to associations for now. Need to consider other association types.
47
+ * Is likely to cause problems with single table inheritance. Could probably refer to table names rather than model names when traversing relationships... but this would still be an issue for the base models.
48
+ * Currently performs an "IN" query. In Oracle, this is limited to 1000 values, so I would need to chunk them for that DBMS.
49
+ * Tests. I know. I just don't yet have my head around how to test something that's SO model- and database-centric, when those models and databases aren't present in the gem. Any advice would be appreciated.
50
+
51
+ == Copyright
52
+
53
+ Copyright (c) 2014 Mason F. Matthews. See LICENSE.txt for
54
+ further details.
55
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Abort early, with a hint, when the bundle is not installed.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Gem packaging / release tasks.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "distyll"
  gem.homepage = "http://github.com/masonfmatthews/distyll"
  gem.license = "MIT"
  gem.summary = %Q{A gem for distilling a massive database into a recent (and smaller) data set}
  gem.description = %Q{Have you ever had a 100GB production database and been unable to test on an internally consistent subset of the data? Distyll is your answer.}
  gem.email = "mason.f.matthews@gmail.com"
  gem.authors = ["Mason F. Matthews"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake simplecov` runs the test task with COVERAGE set, which test/helper.rb
# checks before starting SimpleCov.
desc "Code coverage detail"
task :simplecov do
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

# API documentation. RDoc::Task is the class provided by 'rdoc/task'.
require 'rdoc/task'
RDoc::Task.new do |rdoc|
  # Strip so a trailing newline in VERSION does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "distyll #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/distyll.gemspec ADDED
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "distyll"
8
+ s.version = "0.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Mason F. Matthews"]
12
+ s.date = "2014-06-03"
13
+ s.description = "Have you ever had a 100GB production database and been unable to test on an internally consistent subset of the data? Distyll is your answer."
14
+ s.email = "mason.f.matthews@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "distyll.gemspec",
28
+ "lib/distyll.rb",
29
+ "test/helper.rb",
30
+ "test/test_distyll.rb"
31
+ ]
32
+ s.homepage = "http://github.com/masonfmatthews/distyll"
33
+ s.licenses = ["MIT"]
34
+ s.require_paths = ["lib"]
35
+ s.rubygems_version = "2.0.14"
36
+ s.summary = "A gem for distilling a massive database into a recent (and smaller) data set"
37
+
38
+ if s.respond_to? :specification_version then
39
+ s.specification_version = 4
40
+
41
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
42
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
43
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
44
+ s.add_development_dependency(%q<bundler>, ["~> 1.0"])
45
+ s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
46
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
47
+ else
48
+ s.add_dependency(%q<shoulda>, [">= 0"])
49
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
50
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
51
+ s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
52
+ s.add_dependency(%q<simplecov>, [">= 0"])
53
+ end
54
+ else
55
+ s.add_dependency(%q<shoulda>, [">= 0"])
56
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
57
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
58
+ s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
59
+ s.add_dependency(%q<simplecov>, [">= 0"])
60
+ end
61
+ end
62
+
data/lib/distyll.rb ADDED
@@ -0,0 +1,120 @@
# Entry point of the gem. Given a set of "base" models and a date, it collects
# the ids of the base records created on or after that date, follows the
# associations exposed by each DistyllModelProfile to pull in every record
# those rows reference (transitively), and finally asks each profile to copy
# its records.
#
#   Distyll.new(%w[Order Invoice], 1.month.ago).run
class Distyll

  attr_reader :base_models, :model_profiles, :created_since

  # bms - the base models: an enumerable of model names and/or model classes,
  #       or a single name or class.
  # cs  - the cut-off; anything that responds to +to_date+.
  #
  # Building the instance only sets up one profile per reachable model;
  # nothing is copied until #run is called.
  def initialize(bms, cs)
    @created_since = cs.to_date
    candidates = bms.is_a?(Enumerable) ? bms.to_a : [bms]
    @base_models = candidates.map { |bm| bm.is_a?(Class) ? bm : bm.to_s.constantize }
    set_model_profiles
  end

  # Collects ids and copies the records. Returns the model_profiles hash.
  def run
    # Seed: recent records of the base models.
    base_models.each do |model|
      @model_profiles[model].load_ids_by_timestamp(@created_since)
    end

    # Follow associations round by round until a round adds no new ids.
    prior_count = -1
    count = current_count
    until count == prior_count
      prior_count = count

      # Ids found in the previous round become the ones to expand in this one.
      @model_profiles.each_value(&:demote_new_ids)

      @model_profiles.each_value do |profile|
        profile.associations.each do |a|
          @model_profiles[a.klass].load_ids(profile.get_new_associated_ids(a))
        end
      end

      count = current_count
    end

    @model_profiles.each_value(&:copy_records)
  end


  private

  # Builds @model_profiles: one profile for every model reachable from the
  # base models.
  def set_model_profiles
    @model_profiles = {}
    base_models.each { |bm| potentially_add_profiles(bm, @model_profiles) }
  end

  # Adds a profile for +model+ (unless one exists) and, recursively, for the
  # target class of each of its associations. Returns +profiles+.
  def potentially_add_profiles(model, profiles)
    return profiles if profiles.include? model

    # Register before recursing so association cycles terminate.
    profile = DistyllModelProfile.new(model)
    profiles[model] = profile
    profile.associations.each { |a| potentially_add_profiles(a.klass, profiles) }
    profiles
  end

  # Total number of distinct ids gathered so far across all profiles.
  def current_count
    model_profiles.values.map(&:get_id_count).inject(0, :+)
  end

end
57
+
58
+
59
+
# Bookkeeping for a single model: which of its ids have been selected for
# copying, which of them were found in the current and the previous traversal
# round, and which of its associations should be followed.
class DistyllModelProfile
  attr_reader :model, :record_count, :associations, :all_ids, :last_ids, :new_ids

  # m - the model class this profile describes.
  def initialize(m)
    @model = m
    @record_count = m.count
    @all_ids = []
    @last_ids = []
    @new_ids = []
    set_associations
  end

  # Starts a new traversal round: the ids found in the previous round become
  # last_ids and new_ids is reset.
  def demote_new_ids
    @last_ids = @new_ids
    @new_ids = []
  end

  # Selects every record whose created_at is at or after +timestamp+.
  def load_ids_by_timestamp(timestamp)
    load_ids(model.where("created_at >= ?", timestamp).select(:id).map(&:id))
  end

  # Adds +ids+ to both new_ids and all_ids. nil entries are ignored.
  # Duplicates are tolerated here; get_id_count removes them from all_ids.
  def load_ids(ids)
    ids = ids.compact
    @new_ids += ids
    @all_ids += ids
  end

  # De-duplicates all_ids and returns how many distinct ids are selected.
  def get_id_count
    @all_ids = @all_ids.uniq
    @all_ids.count
  end

  # Returns the distinct, non-nil values of association +a+'s foreign key
  # across the records found in the previous round (last_ids).
  def get_new_associated_ids(a)
    model.where(id: last_ids).select(a.foreign_key).map { |r| r.send(a.foreign_key) }.compact.uniq
  end

  # Copies every selected record into the "distyll" database, skipping
  # validations. Returns the copied records, or nil when nothing is selected.
  def copy_records
    return nil if all_ids.empty?

    # Load from the current database before the connection is switched.
    records = model.where(id: all_ids).load

    begin
      # Symbols are used for both lookups so they name database.yml entries;
      # NOTE(review): newer ActiveRecord versions appear to treat a String
      # argument as a connection URL - confirm against the Rails version in use.
      model.establish_connection(:distyll)
      records.each { |record| model.new(record.attributes).save!(validate: false) }
    ensure
      # Always switch back, even when a save! raises, so the model is not
      # left pointing at the distyll database.
      model.establish_connection(Rails.env.to_sym)
    end

    records
  end

  private

  # Keeps the direct (non-through) belongs_to associations of the model.
  # Polymorphic associations are skipped: they have no single target class,
  # so callers that ask the reflection for its klass cannot handle them.
  def set_associations
    @associations = model.reflect_on_all_associations.select do |association|
      association.belongs_to? &&
        association.through_reflection.nil? &&
        !association.options[:polymorphic]
    end
  end

end
data/test/helper.rb ADDED
@@ -0,0 +1,34 @@
require 'simplecov'

# Reopen SimpleCov's configuration module to add a helper that discards
# whatever filters are currently set.
module SimpleCov::Configuration
  def clean_filters
    @filters = []
  end
end

SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is collected only when the COVERAGE environment variable is set.
if ENV["COVERAGE"]
  SimpleCov.start { add_filter "/.rvm/" }
end

require 'rubygems'
require 'bundler'

# Stop with a hint when the bundle has not been installed.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# Put lib/ and then this directory at the front of the load path.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'distyll'

# Reopened (empty) so shared test helpers have a place to live.
class Test::Unit::TestCase
end
@@ -0,0 +1,5 @@
require 'helper'

# Smoke tests: they need no database and only check that requiring the
# library defines its two classes.
class TestDistyll < Test::Unit::TestCase

  def test_defines_distyll_class
    assert_kind_of Class, Distyll
  end

  def test_defines_model_profile_class
    assert_kind_of Class, DistyllModelProfile
  end

end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: distyll
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Mason F. Matthews
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: shoulda
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rdoc
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '3.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '3.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: jeweler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 2.0.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 2.0.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Have you ever had a 100GB production database and been unable to test
84
+ on an internally consistent subset of the data? Distyll is your answer.
85
+ email: mason.f.matthews@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files:
89
+ - LICENSE.txt
90
+ - README.rdoc
91
+ files:
92
+ - .document
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE.txt
96
+ - README.rdoc
97
+ - Rakefile
98
+ - VERSION
99
+ - distyll.gemspec
100
+ - lib/distyll.rb
101
+ - test/helper.rb
102
+ - test/test_distyll.rb
103
+ homepage: http://github.com/masonfmatthews/distyll
104
+ licenses:
105
+ - MIT
106
+ metadata: {}
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 2.0.14
124
+ signing_key:
125
+ specification_version: 4
126
+ summary: A gem for distilling a massive database into a recent (and smaller) data
127
+ set
128
+ test_files: []