data_miner 1.3.8 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/CHANGELOG +42 -0
  2. data/Gemfile +19 -3
  3. data/README.rdoc +3 -3
  4. data/Rakefile +13 -15
  5. data/data_miner.gemspec +4 -15
  6. data/lib/data_miner.rb +69 -70
  7. data/lib/data_miner/active_record_extensions.rb +17 -22
  8. data/lib/data_miner/attribute.rb +176 -179
  9. data/lib/data_miner/dictionary.rb +38 -31
  10. data/lib/data_miner/run.rb +49 -18
  11. data/lib/data_miner/script.rb +116 -0
  12. data/lib/data_miner/step.rb +5 -0
  13. data/lib/data_miner/step/import.rb +74 -0
  14. data/lib/data_miner/step/process.rb +34 -0
  15. data/lib/data_miner/step/tap.rb +134 -0
  16. data/lib/data_miner/version.rb +1 -1
  17. data/test/helper.rb +26 -24
  18. data/test/support/breeds.xls +0 -0
  19. data/test/support/pet_color_dictionary.en.csv +5 -0
  20. data/test/support/pet_color_dictionary.es.csv +5 -0
  21. data/test/support/pets.csv +5 -0
  22. data/test/support/pets_funny.csv +4 -0
  23. data/test/test_data_miner.rb +103 -0
  24. data/test/test_earth_import.rb +25 -0
  25. data/test/test_earth_tap.rb +25 -0
  26. data/test/test_safety.rb +43 -0
  27. metadata +72 -78
  28. data/.document +0 -5
  29. data/lib/data_miner/config.rb +0 -124
  30. data/lib/data_miner/import.rb +0 -93
  31. data/lib/data_miner/process.rb +0 -38
  32. data/lib/data_miner/tap.rb +0 -143
  33. data/test/support/aircraft.rb +0 -102
  34. data/test/support/airport.rb +0 -16
  35. data/test/support/automobile_fuel_type.rb +0 -40
  36. data/test/support/automobile_variant.rb +0 -362
  37. data/test/support/country.rb +0 -15
  38. data/test/support/test_database.rb +0 -311
  39. data/test/test_data_miner_attribute.rb +0 -111
  40. data/test/test_data_miner_process.rb +0 -18
  41. data/test/test_old_syntax.rb +0 -825
  42. data/test/test_tap.rb +0 -21
data/.document DELETED
@@ -1,5 +0,0 @@
1
- README.rdoc
2
- lib/**/*.rb
3
- bin/*
4
- features/**/*.feature
5
- LICENSE
@@ -1,124 +0,0 @@
1
- require 'blockenspiel'
2
- require 'benchmark'
3
-
4
- class DataMiner
5
- class Config
6
- include ::Blockenspiel::DSL
7
-
8
- attr_reader :resource
9
-
10
- def initialize(resource)
11
- @resource = resource
12
- end
13
-
14
- def steps
15
- @steps ||= []
16
- end
17
-
18
- # def attributes
19
- # @attributes ||= {}
20
- # end
21
-
22
- def process(method_id_or_description, &blk)
23
- step = Process.new self, method_id_or_description, &blk
24
- steps.push step
25
- end
26
-
27
- def tap(description, source, options = {})
28
- step = Tap.new self, description, source, options
29
- steps.push step
30
- end
31
-
32
- def import(*args, &blk)
33
- if args.length == 1
34
- description = '(no description)'
35
- else
36
- description = args[0]
37
- end
38
- options = args.last
39
-
40
- step = Import.new self, description, options
41
- ::Blockenspiel.invoke blk, step
42
- steps.push step
43
- end
44
-
45
- # Mine data for this class.
46
- def run(options = {})
47
- options = options.dup
48
- options.stringify_keys!
49
-
50
- return if ::DataMiner.instance.call_stack.include? resource.name
51
- ::DataMiner.instance.call_stack.push resource.name
52
-
53
- finished = false
54
- skipped = false
55
- run = if Run.table_exists?
56
- Run.create! :started_at => ::Time.now, :resource_name => resource.name, :killed => true
57
- end
58
- resource.delete_all if options['from_scratch']
59
- begin
60
- steps.each do |step|
61
- time = ::Benchmark.realtime { step.run }
62
- resource.reset_column_information
63
- DataMiner.logger.info %{Ran #{step.inspect} in #{time.to_i}}
64
- end
65
- finished = true
66
- rescue Finish
67
- finished = true
68
- rescue Skip
69
- skipped = true
70
- ensure
71
- if Run.table_exists?
72
- run.update_attributes! :terminated_at => ::Time.now, :finished => finished, :skipped => skipped, :killed => false
73
- end
74
- if ::DataMiner.instance.call_stack.first == resource.name and !options['preserve_call_stack_between_runs']
75
- ::DataMiner.instance.call_stack.clear
76
- end
77
- end
78
- nil
79
- end
80
-
81
- def import_steps
82
- steps.select { |step| step.is_a? Import }
83
- end
84
-
85
- def before_invoke
86
-
87
- end
88
-
89
- def after_invoke
90
- return unless resource.table_exists?
91
- make_sure_unit_definitions_make_sense
92
- end
93
-
94
- COMPLETE_UNIT_DEFINITIONS = [
95
- %w{units},
96
- %w{from_units to_units},
97
- %w{units_field_name},
98
- %w{units_field_name to_units},
99
- %w{units_field_number},
100
- %w{units_field_number to_units}
101
- ]
102
-
103
- def make_sure_unit_definitions_make_sense
104
- import_steps.each do |step|
105
- step.attributes.each do |_, attribute|
106
- if attribute.options.any? { |k, _| k.to_s =~ /unit/ } and COMPLETE_UNIT_DEFINITIONS.none? { |complete_definition| complete_definition.all? { |required_option| attribute.options[required_option].present? } }
107
- raise %{
108
-
109
- ================================
110
-
111
- You don't have a valid unit definition for #{resource.name}##{attribute.name}.
112
-
113
- You supplied #{attribute.options.keys.select { |k, _| k.to_s =~ /unit/ }.inspect }.
114
-
115
- You need to supply one of #{COMPLETE_UNIT_DEFINITIONS.map(&:inspect).to_sentence}".
116
-
117
- ================================
118
- }
119
- end
120
- end
121
- end
122
- end
123
- end
124
- end
@@ -1,93 +0,0 @@
1
- require 'blockenspiel'
2
- require 'errata'
3
- require 'remote_table'
4
- class DataMiner
5
- class Import
6
- include ::Blockenspiel::DSL
7
-
8
- attr_reader :attributes
9
- attr_reader :config
10
- attr_reader :options
11
- attr_reader :description
12
-
13
- def initialize(config, description, options = {})
14
- @config = config
15
- @description = description
16
- @options = options.dup
17
- @options.stringify_keys!
18
- # legacy
19
- if @options.has_key? 'table'
20
- DataMiner.logger.warn "'table' is no longer an allowed option, taking the url from it and ignoring the rest"
21
- table_instance = @options.delete 'table'
22
- @options['url'] = table_instance.url
23
- end
24
- # legacy
25
- if @options.has_key?('errata') and not @options['errata'].is_a?(::Hash)
26
- DataMiner.logger.warn "'errata' must be a hash of Errata options. taking the URL from the Errata instance you provided and ignoring everything else"
27
- errata_instance = @options.delete 'errata'
28
- @options['errata'] = { 'url' => errata_instance.options['url'] }
29
- end
30
- end
31
-
32
- def attributes
33
- @attributes ||= ::ActiveSupport::OrderedHash.new
34
- end
35
-
36
- def resource
37
- config.resource
38
- end
39
-
40
- def inspect
41
- %{#<DataMiner::Import(#{resource}) #{description}>}
42
- end
43
-
44
- def store(attr_name, attr_options = {})
45
- raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
46
- attributes[attr_name] = Attribute.new self, attr_name, attr_options
47
- end
48
-
49
- def key(attr_name, attr_options = {})
50
- raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
51
- @_key = attr_name
52
- store attr_name, attr_options
53
- end
54
-
55
- def primary_key
56
- resource.primary_key
57
- end
58
-
59
- def table
60
- return @table if @table.is_a? ::RemoteTable
61
- # don't mess with the originals
62
- options = @options.dup
63
- options['streaming'] = true
64
- if options['errata']
65
- errata_options = options['errata'].dup
66
- errata_options.stringify_keys!
67
- errata_options['responder'] ||= resource
68
- options['errata'] = errata_options
69
- end
70
- @table = ::RemoteTable.new options
71
- end
72
-
73
- def free
74
- attributes.each { |_, attr| attr.free }
75
- @table.free if @table.is_a?(::RemoteTable)
76
- @table = nil
77
- end
78
-
79
- def run
80
- table.each do |row|
81
- record = resource.send "find_or_initialize_by_#{@_key}", attributes[@_key].value_from_row(row)
82
- attributes.each { |_, attr| attr.set_record_from_row record, row }
83
- begin
84
- record.save!
85
- rescue
86
- DataMiner.logger.warn "[data_miner] Got #{$!.inspect} when trying to save #{row}"
87
- end
88
- end
89
- free
90
- nil
91
- end
92
- end
93
- end
@@ -1,38 +0,0 @@
1
- class DataMiner
2
- class Process
3
- attr_reader :config
4
- attr_reader :method_id
5
- attr_reader :description
6
- attr_reader :blk
7
-
8
- alias :block_description :description
9
-
10
- def initialize(config, method_id_or_description, &blk)
11
- @config = config
12
- if block_given?
13
- @description = method_id_or_description
14
- @blk = blk
15
- else
16
- @description = method_id_or_description
17
- @method_id = method_id_or_description
18
- end
19
- end
20
-
21
- def resource
22
- config.resource
23
- end
24
-
25
- def inspect
26
- %{#<DataMiner::Process(#{resource}) #{description}>}
27
- end
28
-
29
- def run
30
- if blk
31
- blk.call
32
- else
33
- resource.send method_id
34
- end
35
- nil
36
- end
37
- end
38
- end
@@ -1,143 +0,0 @@
1
- require 'uri'
2
- class DataMiner
3
- # Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
4
- #
5
- # This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
6
- class Tap
7
- attr_reader :config
8
- attr_reader :description
9
- attr_reader :source
10
- attr_reader :options
11
-
12
- def initialize(config, description, source, options = {})
13
- @config = config
14
- @options = options.dup
15
- @options.stringify_keys!
16
- @description = description
17
- @source = source
18
- end
19
-
20
- def resource
21
- config.resource
22
- end
23
-
24
- def inspect
25
- %{#<DataMiner::Tap(#{resource}): #{description} (#{source})>}
26
- end
27
-
28
- def run
29
- [ source_table_name, resource.table_name ].each do |possible_obstacle|
30
- if connection.table_exists? possible_obstacle
31
- connection.drop_table possible_obstacle
32
- end
33
- end
34
- taps_pull
35
- if needs_table_rename?
36
- connection.rename_table source_table_name, resource.table_name
37
- end
38
- nil
39
- end
40
-
41
- # sabshere 1/25/11 what if there were multiple connections
42
- # blockenspiel doesn't like to delegate this to #resource
43
- def connection
44
- ::ActiveRecord::Base.connection
45
- end
46
-
47
- def db_config
48
- @db_config ||= connection.instance_variable_get(:@config).stringify_keys.merge(options.except('source_table_name'))
49
- end
50
-
51
- def source_table_name
52
- options['source_table_name'] || resource.table_name
53
- end
54
-
55
- def needs_table_rename?
56
- source_table_name != resource.table_name
57
- end
58
-
59
- def adapter
60
- case connection.adapter_name
61
- when /mysql2/i
62
- 'mysql2'
63
- when /mysql/i
64
- 'mysql'
65
- when /postgres/i
66
- 'postgres'
67
- when /sqlite/i
68
- 'sqlite'
69
- end
70
- end
71
-
72
- # never optional
73
- def database
74
- db_config['database']
75
- end
76
-
77
- DEFAULT_PORTS = {
78
- 'mysql' => 3306,
79
- 'mysql2' => 3306,
80
- 'postgres' => 5432
81
- }
82
-
83
- DEFAULT_USERNAMES = {
84
- 'mysql' => 'root',
85
- 'mysql2' => 'root',
86
- 'postgres' => ''
87
- }
88
-
89
- DEFAULT_PASSWORDS = {}
90
- DEFAULT_PASSWORDS.default = ''
91
-
92
- DEFAULT_HOSTS = {}
93
- DEFAULT_HOSTS.default = 'localhost'
94
-
95
- %w{ username password port host }.each do |x|
96
- module_eval %{
97
- def #{x}
98
- db_config['#{x}'] || DEFAULT_#{x.upcase}S[adapter]
99
- end
100
- }
101
- end
102
-
103
- # "user:pass"
104
- # "user"
105
- # nil
106
- def userinfo
107
- if username.present?
108
- [username, password].select(&:present?).join(':')
109
- end
110
- end
111
-
112
- def db_url
113
- case adapter
114
- when 'sqlite'
115
- "sqlite://#{database}"
116
- else
117
- ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
118
- end
119
- end
120
-
121
- def taps_pull
122
- args = [
123
- 'taps',
124
- 'pull',
125
- db_url,
126
- source,
127
- '--indexes-first',
128
- '--tables',
129
- source_table_name
130
- ]
131
- child = nil
132
-
133
- # https://github.com/carlhuda/bundler/issues/1579
134
- if defined?(::Bundler)
135
- ::Bundler.with_clean_env do
136
- ::Kernel.system args.join(' ')
137
- end
138
- else
139
- ::Kernel.system args.join(' ')
140
- end
141
- end
142
- end
143
- end
@@ -1,102 +0,0 @@
1
- require 'loose_tight_dictionary'
2
-
3
- class Aircraft < ActiveRecord::Base
4
- set_primary_key :icao_code
5
- set_table_name 'aircraft'
6
-
7
- def self.bts_dictionary
8
- @_dictionary ||= LooseTightDictionary.new RemoteTable.new(:url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv', :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }),
9
- :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
10
- :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
11
- :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
12
- :left_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Model'] },
13
- :right_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
14
- end
15
-
16
- class BtsAircraftTypeCodeMatcher
17
- def match(left_record)
18
- right_record = Aircraft.bts_dictionary.left_to_right left_record
19
- right_record['Aircraft Type'] if right_record
20
- end
21
- end
22
-
23
- class BtsNameMatcher
24
- def match(left_record)
25
- right_record = Aircraft.bts_dictionary.left_to_right left_record
26
- right_record['Manufacturer'] + ' ' + right_record['Long Name'] if right_record
27
- end
28
- end
29
-
30
- class Guru
31
- # for errata
32
- def is_attributed_to_boeing?(row)
33
- row['Manufacturer'] =~ /BOEING/i
34
- end
35
-
36
- def is_not_attributed_to_airbus?(row)
37
- row['Manufacturer'] =~ /AIRBUS/i
38
- end
39
-
40
- def is_attributed_to_cessna?(row)
41
- row['Manufacturer'] =~ /CESSNA/i
42
- end
43
-
44
- def is_attributed_to_fokker?(row)
45
- row['Manufacturer'] =~ /FOKKER/i
46
- end
47
-
48
- def is_not_attributed_to_aerospatiale?(row)
49
- not row['Manufacturer'] =~ /AEROSPATIALE/i
50
- end
51
-
52
- def is_not_attributed_to_cessna?(row)
53
- not row['Manufacturer'] =~ /CESSNA/i
54
- end
55
-
56
- def is_not_attributed_to_learjet?(row)
57
- not row['Manufacturer'] =~ /LEAR/i
58
- end
59
-
60
- def is_not_attributed_to_dehavilland?(row)
61
- not row['Manufacturer'] =~ /DE ?HAVILLAND/i
62
- end
63
-
64
- def is_not_attributed_to_mcdonnell_douglas?(row)
65
- not row['Manufacturer'] =~ /MCDONNELL DOUGLAS/i
66
- end
67
-
68
- def is_not_a_dc_plane?(row)
69
- not row['Model'] =~ /DC/i
70
- end
71
-
72
- def is_a_crj_900?(row)
73
- row['Designator'].downcase == 'crj9'
74
- end
75
- end
76
-
77
- data_miner do
78
- # ('A'..'Z').each do |letter|
79
- # Note: for the purposes of testing, only importing "D"
80
- %w{ D }.each do |letter|
81
- import("ICAO codes starting with letter #{letter} used by the FAA",
82
- :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
83
- :encoding => 'windows-1252',
84
- :errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => 'Aircraft::Guru' },
85
- :row_xpath => '//table/tr[2]/td/table/tr',
86
- :column_xpath => 'td') do
87
- key 'icao_code', :field_name => 'Designator'
88
- store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new, :nullify => true
89
- store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new, :nullify => true
90
- store 'manufacturer_name', :field_name => 'Manufacturer', :nullify => true
91
- store 'name', :field_name => 'Model', :nullify => true
92
- end
93
-
94
- import 'Brighter Planet aircraft class codes',
95
- :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
96
- key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
97
- store 'brighter_planet_aircraft_class_code', :nullify => true
98
- end
99
- end
100
- end
101
- end
102
-