data_miner 0.4.42 → 0.4.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -1,90 +1,265 @@
1
1
  =data_miner
2
2
 
3
- Mine remote data into your ActiveRecord models.
3
+ Programmatically import useful data into your ActiveRecord models.
4
4
 
5
- ==Quick start
6
-
7
- Put this in <tt>config/environment.rb</tt>:
5
+ (see http://wiki.github.com/seamusabshere/data_miner for more examples)
8
6
 
9
- config.gem 'data_miner'
7
+ ==Quick start
10
8
 
11
- You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
9
+ You define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
12
10
 
13
11
  class Country < ActiveRecord::Base
14
- set_primary_key :iso_3166
12
+ set_primary_key :iso_3166_code
15
13
 
16
14
  data_miner do
17
- import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
18
- key 'iso_3166'
19
- store 'iso_3166', :field_number => 1
20
- store 'name', :field_number => 0
15
+ schema do
16
+ string 'iso_3166_code'
17
+ string 'name'
21
18
  end
22
19
 
23
- import 'A Princeton dataset with better capitalization for some countries', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
24
- key 'iso_3166'
25
- store 'iso_3166', :field_name => 'country code'
26
- store 'name', :field_name => 'country'
20
+ import 'the official ISO country list',
21
+ :url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
22
+ :skip => 2,
23
+ :headers => false,
24
+ :delimiter => ';',
25
+ :encoding => 'ISO-8859-1' do
26
+ key 'iso_3166_code', :field_number => 1
27
+ store 'name', :field_number => 0
27
28
  end
28
29
  end
29
30
  end
30
31
 
31
- ...and in <tt>app/models/airport.rb</tt>:
32
+ Now you can run:
32
33
 
33
- class Airport < ActiveRecord::Base
34
- set_primary_key :iata_code
35
-
36
- data_miner do
37
- import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
38
- key 'iata_code'
39
- store 'name', :field_number => 1
40
- store 'city', :field_number => 2
41
- store 'country_name', :field_number => 3
42
- store 'iata_code', :field_number => 4
43
- store 'latitude', :field_number => 6
44
- store 'longitude', :field_number => 7
45
- end
46
- end
47
- end
34
+ irb(main):001:0> Country.run_data_miner!
35
+ => nil
48
36
 
49
- Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't know a way to automatically include gem tasks, so you have to do this manually for now)
37
+ ==Advanced usage
50
38
 
51
- namespace :data_miner do
52
- task :run => :environment do
53
- resource_names = %w{R RESOURCES RESOURCE RESOURCE_NAMES}.map { |possible_key| ENV[possible_key].to_s }.join.split(/\s*,\s*/).flatten.compact
54
- DataMiner.run :resource_names => resource_names
55
- end
56
- end
57
-
58
- Once you have (1) set up the order of data mining and (2) defined <tt>data_miner</tt> blocks in your classes, you can:
39
+ This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportation's list of aircraft:
59
40
 
60
- $ rake data_miner:run RESOURCES=Airport,Country
41
+ class Aircraft < ActiveRecord::Base
42
+ # Tell ActiveRecord that we want to use a string primary key.
43
+ # This makes it easier to repeatedly truncate and re-import this
44
+ # table without breaking associations.
45
+ set_primary_key :icao_code
46
+
47
+ # A dictionary between BTS aircraft type codes and ICAO aircraft
48
+ # codes that uses string similarity instead of exact matching.
49
+ # This is preferable to typing everything out.
50
+ def self.bts_name_dictionary
51
+ # Sorry for documenting the LooseTightDictionary gem here, but it's useful
52
+ @_bts_dictionary ||= LooseTightDictionary.new(
53
+ # The first argument is the source... the possible matches. Most Enumerables will do.
54
+ RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
55
+ # Tightenings optionally pull out what is important on both sides of a potential match
56
+ :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
57
+ # Identities optionally require a particular capture from both sides of a match to be equal
58
+ :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
59
+ # Blockings restrict comparisons to a subset where everything matches the blocking
60
+ :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
61
+ # This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
62
+ # This is useful because we say /boeing/ and only boeings are matched against other boeings.
63
+ :blocking_only => true,
64
+ # Tell the dictionary how to read things from the source.
65
+ :right_reader => lambda { |record| record['Description'] }
66
+ )
67
+ end
68
+
69
+ # A dictionary between what appear to be ICAO aircraft names and
70
+ # objects of this class itself.
71
+ # Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
72
+ def self.icao_name_dictionary
73
+ @_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
74
+ :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
75
+ :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
76
+ :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
77
+ :right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
78
+ end
61
79
 
62
- ==Complete example
80
+ # This responds to the "Matcher" interface as defined by DataMiner.
81
+ # In other words, it takes Matcher#match(*args) and returns something.
82
+ class BtsMatcher
83
+ attr_reader :wants
84
+ def initialize(wants)
85
+ @wants = wants
86
+ end
87
+ def match(raw_faa_icao_record)
88
+ @_match ||= Hash.new
89
+ return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
90
+ faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
91
+ bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
92
+ retval = case wants
93
+ when :bts_aircraft_type_code
94
+ bts_record['Code']
95
+ when :bts_name
96
+ bts_record['Description']
97
+ end if bts_record
98
+ @_match[raw_faa_icao_record] = retval
99
+ end
100
+ end
63
101
 
64
- ~ $ rails testapp
65
- ~ $ cd testapp/
66
- ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_name:string latitude:float longitude:float
67
- [...edit migration to make iata_code the primary key...]
68
- ~/testapp $ ./script/generate model Country iso_3166:string name:string
69
- [...edit migration to make iso_3166 the primary key...]
70
- ~/testapp $ rake db:migrate
71
- ~/testapp $ touch lib/tasks/data_miner_tasks.rb
72
- [...edit per quick start...]
73
- ~/testapp $ rake data_miner:run RESOURCES=Airport,Country
102
+ # Another class that implements the "Matcher" interface as expected by DataMiner.
103
+ class FuelUseMatcher
104
+ def match(raw_fuel_use_record)
105
+ @_match ||= Hash.new
106
+ return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
107
+ # First try assuming we have an ICAO code
108
+ aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
109
+ Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
110
+ end
111
+ # No luck? then try a fuzzy match
112
+ aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
113
+ Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
114
+ end
115
+ if aircraft_record
116
+ @_match[raw_fuel_use_record] = aircraft_record.icao_code
117
+ else
118
+ # While we're developing the dictionary, we want it to blow up until we have 100% matchability
119
+ raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
120
+ end
121
+ end
122
+ end
123
+
124
+ # This responds to the "Responder" interface as expected by Errata.
125
+ # Basically it lets you say "Is a DC plane" in the errata file and
126
+ # have it map to a Ruby method.
127
+ class Guru
128
+ def is_a_dc_plane?(row)
129
+ row['Designator'] =~ /^DC\d/i
130
+ end
131
+ def is_a_g159?(row)
132
+ row['Designator'] =~ /^G159$/
133
+ end
134
+ def is_a_galx?(row)
135
+ row['Designator'] =~ /^GALX$/
136
+ end
137
+ def method_missing(method_id, *args, &block)
138
+ if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
139
+ manufacturer_name = $1
140
+ manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
141
+ matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
142
+ method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
143
+ else
144
+ super
145
+ end
146
+ end
147
+ end
148
+
149
+ data_miner do
150
+ # In our app, we defined DataMiner::Run.allowed? to return false if a run
151
+ # has taken place in the last hour (among other things).
152
+ # By raising DataMiner::Skip, we skip this run but call it a success.
153
+ process "Don't re-import too often" do
154
+ raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
155
+ end
156
+
157
+ # Define the database schema in-line.
158
+ # It will destructively and automatically add/remove columns.
159
+ # This is "OK" because you can always just re-run the import script to get the data back.
160
+ # PS. if we were using DataMapper, we wouldn't need this.
161
+ schema :options => 'ENGINE=InnoDB default charset=utf8' do
162
+ string 'icao_code'
163
+ string 'manufacturer_name'
164
+ string 'name'
165
+ string 'bts_name'
166
+ string 'bts_aircraft_type_code'
167
+ string 'brighter_planet_aircraft_class_code'
168
+ string 'fuel_use_aircraft_name'
169
+ float 'm3'
170
+ string 'm3_units'
171
+ float 'm2'
172
+ string 'm2_units'
173
+ float 'm1'
174
+ string 'm1_units'
175
+ float 'endpoint_fuel'
176
+ string 'endpoint_fuel_units'
177
+ float 'seats'
178
+ float 'distance'
179
+ string 'distance_units'
180
+ float 'load_factor'
181
+ float 'freight_share'
182
+ float 'payload'
183
+ float 'weighting'
184
+ index 'bts_aircraft_type_code'
185
+ end
74
186
 
75
- Now you should have
187
+ # The FAA publishes a document to help people identify aircraft by different names.
188
+ ('A'..'Z').each do |letter|
189
+ import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
190
+ # The master URL of the source file (one for every letter)
191
+ :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
192
+ # The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
193
+ :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
194
+ # If it's not UTF-8, you should say what it is so that we can iconv it!
195
+ :encoding => 'windows-1252',
196
+ # Nokogiri is being used to grab each row starting from the second
197
+ :row_xpath => '//table/tr[2]/td/table/tr',
198
+ # ditto... XPath for Nokogiri
199
+ :column_xpath => 'td' ) do
200
+ # The code that they use is in fact the ICAO code!
201
+ key 'icao_code', :field_name => 'Designator'
202
+ # We get this for free
203
+ store 'manufacturer_name', :field_name => 'Manufacturer'
204
+ # ditto
205
+ store 'name', :field_name => 'Model'
206
+ # Use the loose-tight dictionary.
207
+ # It gets the entire input row to play with before deciding on an output.
208
+ store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
209
+ store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
210
+ end
211
+ end
76
212
 
77
- ~/testapp $ ./script/console
78
- Loading development environment (Rails 2.3.3)
79
- >> Airport.first.iata_code
80
- => "GKA"
81
- >> Airport.first.country_name
82
- => "Papua New Guinea"
213
+ # Pull in some data that might only be important to Brighter Planet
214
+ import "Brighter Planet's aircraft class codes",
215
+ :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
216
+ key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
217
+ store 'brighter_planet_aircraft_class_code'
218
+ end
83
219
 
84
- ==Wishlist
220
+ # Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
221
+ # This data comes from the EEA.
222
+ import "pre-calculated fuel use equation coefficients",
223
+ :url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
224
+ :select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
225
+ # We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
226
+ key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new
227
+ # Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
228
+ store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
229
+ store 'm3'
230
+ store 'm2'
231
+ store 'm1'
232
+ store 'endpoint_fuel', :field_name => 'b'
233
+ end
234
+
235
+ # Use arel and the weighted_average gem to do some crazy averaging.
236
+ # This assumes that you're dealing with the BTS T-100 flight segment data.
237
+ # See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
238
+ process "Derive some average flight characteristics from flight segments" do
239
+ FlightSegment.run_data_miner!
240
+ aircraft = Aircraft.arel_table
241
+ segments = FlightSegment.arel_table
85
242
 
86
- * each_record do |record| ... which would use find_in_batches
87
- * when proxying add_column, rename_column, etc. automatically include the table name
243
+ conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
244
+ update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
245
+ update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
246
+ update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
247
+ update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
248
+ update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"
249
+
250
+ update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
251
+ end
252
+
253
+ # And finally re-run the import of resources that depend on this resource.
254
+ # Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
255
+ # that's the right way to do dependencies. It won't get called twice in the same run.
256
+ [ AircraftManufacturer ].each do |synthetic_resource|
257
+ process "Synthesize #{synthetic_resource}" do
258
+ synthetic_resource.run_data_miner!
259
+ end
260
+ end
261
+ end
262
+ end
88
263
 
89
264
  ==Authors
90
265
 
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ begin
10
10
  gem.email = "seamus@abshere.net"
11
11
  gem.homepage = "http://github.com/seamusabshere/data_miner"
12
12
  gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
13
- gem.add_dependency 'remote_table', '>=0.2.26'
13
+ gem.add_dependency 'remote_table', '>=0.2.27'
14
14
  gem.add_dependency 'escape', '>=0.0.4'
15
15
  gem.add_dependency 'activerecord', '>=2.3.4'
16
16
  gem.add_dependency 'activesupport', '>=2.3.4'
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.42
1
+ 0.4.43
data/data_miner.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{data_miner}
8
- s.version = "0.4.42"
8
+ s.version = "0.4.43"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-06-08}
12
+ s.date = %q{2010-06-30}
13
13
  s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -52,7 +52,7 @@ Gem::Specification.new do |s|
52
52
  s.specification_version = 3
53
53
 
54
54
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
55
- s.add_runtime_dependency(%q<remote_table>, [">= 0.2.26"])
55
+ s.add_runtime_dependency(%q<remote_table>, [">= 0.2.27"])
56
56
  s.add_runtime_dependency(%q<escape>, [">= 0.0.4"])
57
57
  s.add_runtime_dependency(%q<activerecord>, [">= 2.3.4"])
58
58
  s.add_runtime_dependency(%q<activesupport>, [">= 2.3.4"])
@@ -63,7 +63,7 @@ Gem::Specification.new do |s|
63
63
  s.add_runtime_dependency(%q<errata>, [">= 0.2.1"])
64
64
  s.add_runtime_dependency(%q<taps>, [">= 0.3.5"])
65
65
  else
66
- s.add_dependency(%q<remote_table>, [">= 0.2.26"])
66
+ s.add_dependency(%q<remote_table>, [">= 0.2.27"])
67
67
  s.add_dependency(%q<escape>, [">= 0.0.4"])
68
68
  s.add_dependency(%q<activerecord>, [">= 2.3.4"])
69
69
  s.add_dependency(%q<activesupport>, [">= 2.3.4"])
@@ -75,7 +75,7 @@ Gem::Specification.new do |s|
75
75
  s.add_dependency(%q<taps>, [">= 0.3.5"])
76
76
  end
77
77
  else
78
- s.add_dependency(%q<remote_table>, [">= 0.2.26"])
78
+ s.add_dependency(%q<remote_table>, [">= 0.2.27"])
79
79
  s.add_dependency(%q<escape>, [">= 0.0.4"])
80
80
  s.add_dependency(%q<activerecord>, [">= 2.3.4"])
81
81
  s.add_dependency(%q<activesupport>, [">= 2.3.4"])
@@ -186,6 +186,8 @@ On the other hand, if you're working directly with create_table, this might be h
186
186
  resource_name.constantize.data_miner_base.run options
187
187
  end
188
188
  end
189
+ ensure
190
+ RemoteTable.cleanup
189
191
  end
190
192
  end
191
193
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- hash: 91
4
+ hash: 89
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 4
9
- - 42
10
- version: 0.4.42
9
+ - 43
10
+ version: 0.4.43
11
11
  platform: ruby
12
12
  authors:
13
13
  - Seamus Abshere
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-06-08 00:00:00 -04:00
19
+ date: 2010-06-30 00:00:00 -04:00
20
20
  default_executable:
21
21
  dependencies:
22
22
  - !ruby/object:Gem::Dependency
@@ -27,12 +27,12 @@ dependencies:
27
27
  requirements:
28
28
  - - ">="
29
29
  - !ruby/object:Gem::Version
30
- hash: 35
30
+ hash: 33
31
31
  segments:
32
32
  - 0
33
33
  - 2
34
- - 26
35
- version: 0.2.26
34
+ - 27
35
+ version: 0.2.27
36
36
  type: :runtime
37
37
  version_requirements: *id001
38
38
  - !ruby/object:Gem::Dependency