data_miner 0.4.26 → 0.4.27

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -18,6 +18,7 @@ begin
18
18
  gem.add_dependency 'blockenspiel', '>=0.3.2'
19
19
  gem.add_dependency 'log4r', '>=1.1.7'
20
20
  gem.add_dependency 'errata', '>=0.2.1'
21
+ gem.add_dependency 'taps', '>=0.3.5'
21
22
  gem.add_development_dependency "loose_tight_dictionary", ">=0.0.5"
22
23
  gem.require_path = "lib"
23
24
  gem.files.include %w(lib/data_miner) unless gem.files.empty? # seems to fail once it's in the wild
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.26
1
+ 0.4.27
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{data_miner}
8
- s.version = "0.4.26"
8
+ s.version = "0.4.27"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-05-06}
12
+ s.date = %q{2010-05-07}
13
13
  s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -27,12 +27,12 @@ Gem::Specification.new do |s|
27
27
  "data_miner.gemspec",
28
28
  "lib/data_miner.rb",
29
29
  "lib/data_miner/attribute.rb",
30
- "lib/data_miner/clone.rb",
31
30
  "lib/data_miner/configuration.rb",
32
31
  "lib/data_miner/dictionary.rb",
33
32
  "lib/data_miner/import.rb",
34
33
  "lib/data_miner/process.rb",
35
34
  "lib/data_miner/run.rb",
35
+ "lib/data_miner/tap.rb",
36
36
  "test/data_miner_test.rb",
37
37
  "test/test_helper.rb"
38
38
  ]
@@ -60,6 +60,7 @@ Gem::Specification.new do |s|
60
60
  s.add_runtime_dependency(%q<blockenspiel>, [">= 0.3.2"])
61
61
  s.add_runtime_dependency(%q<log4r>, [">= 1.1.7"])
62
62
  s.add_runtime_dependency(%q<errata>, [">= 0.2.1"])
63
+ s.add_runtime_dependency(%q<taps>, [">= 0.3.5"])
63
64
  s.add_development_dependency(%q<loose_tight_dictionary>, [">= 0.0.5"])
64
65
  else
65
66
  s.add_dependency(%q<remote_table>, [">= 0.2.20"])
@@ -70,6 +71,7 @@ Gem::Specification.new do |s|
70
71
  s.add_dependency(%q<blockenspiel>, [">= 0.3.2"])
71
72
  s.add_dependency(%q<log4r>, [">= 1.1.7"])
72
73
  s.add_dependency(%q<errata>, [">= 0.2.1"])
74
+ s.add_dependency(%q<taps>, [">= 0.3.5"])
73
75
  s.add_dependency(%q<loose_tight_dictionary>, [">= 0.0.5"])
74
76
  end
75
77
  else
@@ -81,6 +83,7 @@ Gem::Specification.new do |s|
81
83
  s.add_dependency(%q<blockenspiel>, [">= 0.3.2"])
82
84
  s.add_dependency(%q<log4r>, [">= 1.1.7"])
83
85
  s.add_dependency(%q<errata>, [">= 0.2.1"])
86
+ s.add_dependency(%q<taps>, [">= 0.3.5"])
84
87
  s.add_dependency(%q<loose_tight_dictionary>, [">= 0.0.5"])
85
88
  end
86
89
  end
@@ -22,8 +22,8 @@ require 'data_miner/attribute'
22
22
  require 'data_miner/configuration'
23
23
  require 'data_miner/dictionary'
24
24
  require 'data_miner/import'
25
+ require 'data_miner/tap'
25
26
  require 'data_miner/process'
26
- require 'data_miner/clone'
27
27
  require 'data_miner/run'
28
28
 
29
29
  module DataMiner
@@ -69,6 +69,24 @@ module DataMiner
69
69
  def self.resource_names
70
70
  DataMiner::Configuration.resource_names
71
71
  end
72
+
73
+ # TODO this should probably live somewhere else
74
# Runs +cmd+ through Kernel#` after collapsing every run of whitespace
# (including newlines) into a single space, so multi-line command
# templates become one shell line.
#
# cmd            - String shell command.
# raise_on_error - when true (the default), a non-successful exit status
#                  raises a RuntimeError whose message contains the
#                  command and whatever it printed to standard output.
#
# Returns the command's captured standard output as a String.
# Note that backticks do not capture standard error, so the "Output:"
# section of the error message may be empty for commands that only
# complain on stderr.
def self.backtick_with_reporting(cmd, raise_on_error = true)
  cmd = cmd.gsub(/\s+/m, ' ')
  output = `#{cmd}`
  if raise_on_error && !$?.success?
    raise %{
      From the data_miner gem...

      Command failed:
      #{cmd}

      Output:
      #{output}
    }
  end
  # Hand the captured output back instead of the nil left by the `if`.
  output
end
89
+
72
90
  end
73
91
 
74
92
  ActiveRecord::Base.class_eval do
@@ -16,8 +16,8 @@ module DataMiner
16
16
  self.step_counter += 1
17
17
  end
18
18
 
19
- def clone(description, options = {})
20
- steps << DataMiner::Clone.new(self, step_counter, description, options)
19
# Appends a DataMiner::Tap step built from the given description, source
# and options at the current position, then advances the step counter.
def tap(description, source, options = {})
  tap_step = DataMiner::Tap.new(self, step_counter, description, source, options)
  steps.push(tap_step)
  self.step_counter = step_counter + 1
end
23
23
 
@@ -0,0 +1,137 @@
1
+
2
require 'shellwords'

module DataMiner
  # A data_miner step that replaces a resource's table with a copy fetched
  # by shelling out to `taps pull`, e.g.
  #
  #   taps pull mysql://root:password@localhost/taps_test http://foo:bar@data.brighterplanet.com:5000 --tables aircraft
  #
  # The local database settings are read from the ActiveRecord connection
  # and may be overridden through +options+ (every option except
  # :source_table_name is merged over the connection config).
  class Tap
    # Fallbacks used when the connection config omits a setting.
    DEFAULT_PORTS = {
      'mysql' => 3306,
      'postgres' => 5432
    }.freeze

    DEFAULT_USERNAMES = {
      'mysql' => 'root',
      'postgres' => ''
    }.freeze

    # Any adapter: empty password, localhost.
    DEFAULT_PASSWORDS = Hash.new('').freeze
    DEFAULT_HOSTS = Hash.new('localhost').freeze

    attr_reader :configuration
    attr_reader :position_in_run
    attr_reader :description
    attr_reader :source
    attr_reader :options

    # configuration   - the owning configuration; must respond to #resource.
    # position_in_run - index of this step; anything but 0 is reported
    #                   through DataMiner.log_or_raise.
    # description     - human-readable label used in #inspect.
    # source          - URL handed to `taps pull` as the remote side.
    # options         - Hash; :source_table_name names the remote table when
    #                   it differs from the resource's table, every other key
    #                   overrides the local connection config.
    def initialize(configuration, position_in_run, description, source, options = {})
      DataMiner.log_or_raise "Tap has to be the first step." unless position_in_run == 0
      @configuration = configuration
      @position_in_run = position_in_run
      @description = description
      @source = source
      # Non-bang variant so the caller's hash is left untouched.
      @options = options.symbolize_keys
    end

    # The resource (model) this step loads; forwarded to the configuration.
    def resource
      configuration.resource
    end

    # NOTE(review): source is printed verbatim, so any credentials embedded
    # in the URL end up in logs - confirm that is acceptable.
    def inspect
      "Tap(#{resource}): #{description} (#{source})"
    end

    # Drops any existing table that would collide with the pull, runs
    # `taps pull`, and renames the pulled table to the resource's table name
    # when the two differ.
    def run(run)
      # Build the command first: an unsupported adapter should fail here,
      # before any table has been dropped.
      pull_cmd = taps_pull_cmd
      [ source_table_name, resource.table_name ].uniq.each do |possible_obstacle|
        connection.drop_table(possible_obstacle) if connection.table_exists?(possible_obstacle)
      end
      DataMiner.backtick_with_reporting pull_cmd
      connection.rename_table(source_table_name, resource.table_name) if needs_table_rename?
      DataMiner.log_info "ran #{inspect}"
    end

    private

    def connection
      ActiveRecord::Base.connection
    end

    # Connection config with the step's options layered on top.
    # Presumably the adapter stores its config with symbol keys - the
    # lookups below rely on that.
    def db_config
      @_db_config ||= connection.instance_variable_get(:@config).merge(
        options.reject { |key, _| key == :source_table_name }
      )
    end

    def source_table_name
      options[:source_table_name] || resource.table_name
    end

    def needs_table_rename?
      source_table_name != resource.table_name
    end

    # URL scheme for the local database, derived from the adapter name.
    # Raises RuntimeError for adapters other than mysql/postgres/sqlite
    # instead of silently producing a "://" URL.
    def adapter
      case connection.adapter_name
      when /mysql/i
        'mysql'
      when /postgres/i
        'postgres'
      when /sqlite/i
        'sqlite'
      else
        raise "[data_miner gem] Tap doesn't know how to pull into a #{connection.adapter_name} database"
      end
    end

    # never optional
    def database
      db_config[:database]
    end

    # Defines #username, #password, #port and #host: the configured value,
    # or the per-adapter default from the matching DEFAULT_* constant.
    %w{ username password port host }.each do |x|
      module_eval %{
        def #{x}
          db_config[:#{x}] || DEFAULT_#{x.upcase}S[adapter]
        end
      }
    end

    # Everything after "adapter://" in the local database URL.
    def db_locator
      case adapter
      when 'mysql', 'postgres'
        "#{username}:#{password}@#{host}:#{port}/#{database}"
      when 'sqlite'
        database
      end
    end

    # taps pull mysql://root:password@localhost/taps_test http://foo:bar@data.brighterplanet.com:5000 --tables aircraft
    #
    # Every argument is shell-escaped so that passwords, URLs or table names
    # containing shell metacharacters cannot break or alter the command.
    # NOTE(review): the database password is part of the command line, so it
    # also appears in the error message raised when the command fails.
    def taps_pull_cmd
      arguments = [
        'taps', 'pull',
        "#{adapter}://#{db_locator}",
        source,
        '--tables', source_table_name
      ]
      arguments.map { |argument| Shellwords.escape(argument.to_s) }.join(' ')
    end

    # 2.3.5 mysql
    # * <tt>:host</tt> - Defaults to "localhost".
    # * <tt>:port</tt> - Defaults to 3306.
    # * <tt>:socket</tt> - Defaults to "/tmp/mysql.sock".
    # * <tt>:username</tt> - Defaults to "root"
    # * <tt>:password</tt> - Defaults to nothing.
    # * <tt>:database</tt> - The name of the database. No default, must be provided.
    # * <tt>:encoding</tt> - (Optional) Sets the client encoding by executing "SET NAMES <encoding>" after connection.
    # * <tt>:reconnect</tt> - Defaults to false (See MySQL documentation: http://dev.mysql.com/doc/refman/5.0/en/auto-reconnect.html).
    # * <tt>:sslca</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslkey</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslcert</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslcapath</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslcipher</tt> - Necessary to use MySQL with an SSL connection.
    # 2.3.5 postgresql
    # * <tt>:host</tt> - Defaults to "localhost".
    # * <tt>:port</tt> - Defaults to 5432.
    # * <tt>:username</tt> - Defaults to nothing.
    # * <tt>:password</tt> - Defaults to nothing.
    # * <tt>:database</tt> - The name of the database. No default, must be provided.
    # * <tt>:schema_search_path</tt> - An optional schema search path for the connection given as a string of comma-separated schema names. This is backward-compatible with the <tt>:schema_order</tt> option.
    # * <tt>:encoding</tt> - An optional client encoding that is used in a <tt>SET client_encoding TO <encoding></tt> call on the connection.
    # * <tt>:min_messages</tt> - An optional client min messages that is used in a <tt>SET client_min_messages TO <min_messages></tt> call on the connection.
    # * <tt>:allow_concurrency</tt> - If true, use async query methods so Ruby threads don't deadlock; otherwise, use blocking query methods.
    # 2.3.5 sqlite[3]
    # * <tt>:database</tt> - Path to the database file.
  end
end
@@ -441,21 +441,12 @@ class Airport < ActiveRecord::Base
441
441
  end
442
442
  end
443
443
 
444
- class ClonedAirport < ActiveRecord::Base
444
+ class TappedAirport < ActiveRecord::Base
445
445
  set_primary_key :iata_code
446
- set_table_name 'airports'
447
446
 
448
447
  data_miner do
449
- clone 'a sanitized airports table', :url => 'http://data.brighterplanet.com/airports.sql', :sanity_check => true
450
- end
451
- end
452
-
453
- class BadlyClonedAirport < ActiveRecord::Base
454
- set_primary_key :iata_code
455
- set_table_name 'badly_cloned_airports'
456
-
457
- data_miner do
458
- clone 'a sanitized airports table', :url => 'http://data.brighterplanet.com/airports.sql', :sanity_check => true
448
+ tap "Brighter Planet's sanitized airports table", "http://carbon:neutral@data.brighterplanet.com:5001", :source_table_name => 'airports'
449
+ # tap "Brighter Planet's sanitized airports table", "http://carbon:neutral@localhost:5000", :source_table_name => 'airports'
459
450
  end
460
451
  end
461
452
 
@@ -1063,7 +1054,7 @@ class AircraftDeux < ActiveRecord::Base
1063
1054
  %w{ D }.each do |letter|
1064
1055
  import("ICAO codes starting with letter #{letter} used by the FAA",
1065
1056
  :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
1066
- :encoding => 'US-ASCII',
1057
+ :encoding => 'windows-1252',
1067
1058
  :errata => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
1068
1059
  :row_xpath => '//table/tr[2]/td/table/tr',
1069
1060
  :column_xpath => 'td') do
@@ -1094,23 +1085,17 @@ class DataMinerTest < Test::Unit::TestCase
1094
1085
  assert CrosscallingCensusDivision.exists? :name => 'Mountain Division', :number => 8, :census_region_number => 4, :census_region_name => 'West Region'
1095
1086
  assert CrosscallingCensusRegion.exists? :name => 'West Region', :number => 4
1096
1087
  end
1097
-
1098
- should "clone airports" do
1099
- ClonedAirport.run_data_miner!
1100
- assert ClonedAirport.count > 0
1101
- end
1102
-
1103
- should "raise an error when the SQL for cloning doesn't seem to match up" do
1104
- assert_raises(RuntimeError) do
1105
- BadlyClonedAirport.run_data_miner!
1106
- end
1107
- end
1108
-
1088
+
1109
1089
  should "import airports" do
1110
1090
  Airport.run_data_miner!
1111
1091
  assert Airport.count > 0
1112
1092
  end
1113
1093
 
1094
+ should "tap airports" do
1095
+ TappedAirport.run_data_miner!
1096
+ assert TappedAirport.count > 0
1097
+ end
1098
+
1114
1099
  should "pull in census divisions using a data.brighterplanet.com dictionary" do
1115
1100
  CensusDivision.run_data_miner!
1116
1101
  assert CensusDivision.count > 0
@@ -86,9 +86,9 @@ ActiveRecord::Schema.define(:version => 20090819143429) do
86
86
  t.integer 'data_miner_last_run_id'
87
87
  end
88
88
  execute 'ALTER TABLE t100_flight_segments ADD PRIMARY KEY (row_hash);'
89
-
90
- create_table 'badly_cloned_airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
91
- t.string 'placeholder'
89
+
90
+ create_table 'tapped_airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
91
+ t.string 'i_am_just_here_to_get_in_the_way'
92
92
  end
93
93
 
94
94
  create_table 'airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 4
8
- - 26
9
- version: 0.4.26
8
+ - 27
9
+ version: 0.4.27
10
10
  platform: ruby
11
11
  authors:
12
12
  - Seamus Abshere
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-05-06 00:00:00 -04:00
18
+ date: 2010-05-07 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -131,9 +131,23 @@ dependencies:
131
131
  type: :runtime
132
132
  version_requirements: *id008
133
133
  - !ruby/object:Gem::Dependency
134
- name: loose_tight_dictionary
134
+ name: taps
135
135
  prerelease: false
136
136
  requirement: &id009 !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ segments:
141
+ - 0
142
+ - 3
143
+ - 5
144
+ version: 0.3.5
145
+ type: :runtime
146
+ version_requirements: *id009
147
+ - !ruby/object:Gem::Dependency
148
+ name: loose_tight_dictionary
149
+ prerelease: false
150
+ requirement: &id010 !ruby/object:Gem::Requirement
137
151
  requirements:
138
152
  - - ">="
139
153
  - !ruby/object:Gem::Version
@@ -143,7 +157,7 @@ dependencies:
143
157
  - 5
144
158
  version: 0.0.5
145
159
  type: :development
146
- version_requirements: *id009
160
+ version_requirements: *id010
147
161
  description: Mine remote data into your ActiveRecord models. You can also perform associations and convert units.
148
162
  email: seamus@abshere.net
149
163
  executables: []
@@ -164,12 +178,12 @@ files:
164
178
  - data_miner.gemspec
165
179
  - lib/data_miner.rb
166
180
  - lib/data_miner/attribute.rb
167
- - lib/data_miner/clone.rb
168
181
  - lib/data_miner/configuration.rb
169
182
  - lib/data_miner/dictionary.rb
170
183
  - lib/data_miner/import.rb
171
184
  - lib/data_miner/process.rb
172
185
  - lib/data_miner/run.rb
186
+ - lib/data_miner/tap.rb
173
187
  - test/data_miner_test.rb
174
188
  - test/test_helper.rb
175
189
  has_rdoc: true
@@ -1,86 +0,0 @@
1
- module DataMiner
2
- class Clone
3
- attr_accessor :configuration
4
- attr_accessor :position_in_run
5
- attr_accessor :description
6
- attr_accessor :options
7
- delegate :resource, :to => :configuration
8
-
9
- def initialize(configuration, position_in_run, description, options = {})
10
- DataMiner.log_or_raise "Clone has to be the first step." unless position_in_run == 0
11
- DataMiner.log_or_raise "Clone needs :url" unless options[:url].present?
12
- @configuration = configuration
13
- @position_in_run = position_in_run
14
- @description = description
15
- @options = options
16
- end
17
-
18
- def inspect
19
- "Clone(#{resource}): #{description}"
20
- end
21
-
22
- def run(run)
23
- download_sql_source
24
- perform_sanity_check unless options[:sanity_check] == false
25
- execute_sql_source
26
- DataMiner.log_info "ran #{inspect}"
27
- end
28
-
29
- private
30
-
31
- def tempfile_path
32
- return @_tempfile_path if @_tempfile_path
33
- @_tempfile_path = File.join Dir.tmpdir, rand.to_s
34
- at_exit { FileUtils.rm_f @_tempfile_path }
35
- @_tempfile_path
36
- end
37
-
38
- def download_sql_source
39
- cmd = %{
40
- curl \
41
- --silent \
42
- --header "Expect: " \
43
- --location \
44
- "#{options[:url]}" \
45
- --output "#{tempfile_path}"
46
- }
47
- `#{cmd}`
48
- end
49
-
50
- def perform_sanity_check
51
- File.open(tempfile_path, 'r') do |infile|
52
- while (line = infile.gets)
53
- line_essence = line.gsub /[^\-\_\.a-zA-Z0-9]+/, ' '
54
- if line_essence =~ /(INSERT\s+INTO|CREATE\s+TABLE|ALTER\s+TABLE|DROP\s+TABLE\s+[^I]|DROP\s+TABLE\s+IF\s+EXISTS)\s+([^\s]+)/i
55
- one = $1
56
- two = $2
57
- unless two.split('.').last == resource.table_name
58
- DataMiner.log_or_raise %{
59
-
60
- Warning: clone SQL tries to #{one} on `#{two}` instead of `#{resource.table_name}`. (#{line[0,100]}...)
61
-
62
- If you want to ignore this, use clone 'X', :url => 'Y', :sanity_check => false
63
-
64
- If you need to set a different table name, you could say set_table_name '#{two}' in your ActiveRecord model.
65
- }
66
- end
67
- end
68
- end
69
- end
70
- end
71
-
72
- def execute_sql_source
73
- mysql_config = ActiveRecord::Base.connection.instance_variable_get :@config
74
- cmd = %{
75
- mysql \
76
- --batch \
77
- #{"--host=\"#{mysql_config[:hostname]}\"" if mysql_config[:hostname].present?} \
78
- --user="#{mysql_config[:username]}" \
79
- --password="#{mysql_config[:password]}" \
80
- --database="#{mysql_config[:database]}" \
81
- --execute="SOURCE #{tempfile_path}"
82
- }
83
- `#{cmd}`
84
- end
85
- end
86
- end