data_miner 1.3.8 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. data/CHANGELOG +42 -0
  2. data/Gemfile +19 -3
  3. data/README.rdoc +3 -3
  4. data/Rakefile +13 -15
  5. data/data_miner.gemspec +4 -15
  6. data/lib/data_miner.rb +69 -70
  7. data/lib/data_miner/active_record_extensions.rb +17 -22
  8. data/lib/data_miner/attribute.rb +176 -179
  9. data/lib/data_miner/dictionary.rb +38 -31
  10. data/lib/data_miner/run.rb +49 -18
  11. data/lib/data_miner/script.rb +116 -0
  12. data/lib/data_miner/step.rb +5 -0
  13. data/lib/data_miner/step/import.rb +74 -0
  14. data/lib/data_miner/step/process.rb +34 -0
  15. data/lib/data_miner/step/tap.rb +134 -0
  16. data/lib/data_miner/version.rb +1 -1
  17. data/test/helper.rb +26 -24
  18. data/test/support/breeds.xls +0 -0
  19. data/test/support/pet_color_dictionary.en.csv +5 -0
  20. data/test/support/pet_color_dictionary.es.csv +5 -0
  21. data/test/support/pets.csv +5 -0
  22. data/test/support/pets_funny.csv +4 -0
  23. data/test/test_data_miner.rb +103 -0
  24. data/test/test_earth_import.rb +25 -0
  25. data/test/test_earth_tap.rb +25 -0
  26. data/test/test_safety.rb +43 -0
  27. metadata +72 -78
  28. data/.document +0 -5
  29. data/lib/data_miner/config.rb +0 -124
  30. data/lib/data_miner/import.rb +0 -93
  31. data/lib/data_miner/process.rb +0 -38
  32. data/lib/data_miner/tap.rb +0 -143
  33. data/test/support/aircraft.rb +0 -102
  34. data/test/support/airport.rb +0 -16
  35. data/test/support/automobile_fuel_type.rb +0 -40
  36. data/test/support/automobile_variant.rb +0 -362
  37. data/test/support/country.rb +0 -15
  38. data/test/support/test_database.rb +0 -311
  39. data/test/test_data_miner_attribute.rb +0 -111
  40. data/test/test_data_miner_process.rb +0 -18
  41. data/test/test_old_syntax.rb +0 -825
  42. data/test/test_tap.rb +0 -21
data/.document DELETED
@@ -1,5 +0,0 @@
1
- README.rdoc
2
- lib/**/*.rb
3
- bin/*
4
- features/**/*.feature
5
- LICENSE
data/lib/data_miner/config.rb DELETED
@@ -1,124 +0,0 @@
1
- require 'blockenspiel'
2
- require 'benchmark'
3
-
4
- class DataMiner
5
- class Config
6
- include ::Blockenspiel::DSL
7
-
8
- attr_reader :resource
9
-
10
- def initialize(resource)
11
- @resource = resource
12
- end
13
-
14
- def steps
15
- @steps ||= []
16
- end
17
-
18
- # def attributes
19
- # @attributes ||= {}
20
- # end
21
-
22
- def process(method_id_or_description, &blk)
23
- step = Process.new self, method_id_or_description, &blk
24
- steps.push step
25
- end
26
-
27
- def tap(description, source, options = {})
28
- step = Tap.new self, description, source, options
29
- steps.push step
30
- end
31
-
32
- def import(*args, &blk)
33
- if args.length == 1
34
- description = '(no description)'
35
- else
36
- description = args[0]
37
- end
38
- options = args.last
39
-
40
- step = Import.new self, description, options
41
- ::Blockenspiel.invoke blk, step
42
- steps.push step
43
- end
44
-
45
- # Mine data for this class.
46
- def run(options = {})
47
- options = options.dup
48
- options.stringify_keys!
49
-
50
- return if ::DataMiner.instance.call_stack.include? resource.name
51
- ::DataMiner.instance.call_stack.push resource.name
52
-
53
- finished = false
54
- skipped = false
55
- run = if Run.table_exists?
56
- Run.create! :started_at => ::Time.now, :resource_name => resource.name, :killed => true
57
- end
58
- resource.delete_all if options['from_scratch']
59
- begin
60
- steps.each do |step|
61
- time = ::Benchmark.realtime { step.run }
62
- resource.reset_column_information
63
- DataMiner.logger.info %{Ran #{step.inspect} in #{time.to_i}}
64
- end
65
- finished = true
66
- rescue Finish
67
- finished = true
68
- rescue Skip
69
- skipped = true
70
- ensure
71
- if Run.table_exists?
72
- run.update_attributes! :terminated_at => ::Time.now, :finished => finished, :skipped => skipped, :killed => false
73
- end
74
- if ::DataMiner.instance.call_stack.first == resource.name and !options['preserve_call_stack_between_runs']
75
- ::DataMiner.instance.call_stack.clear
76
- end
77
- end
78
- nil
79
- end
80
-
81
- def import_steps
82
- steps.select { |step| step.is_a? Import }
83
- end
84
-
85
- def before_invoke
86
-
87
- end
88
-
89
- def after_invoke
90
- return unless resource.table_exists?
91
- make_sure_unit_definitions_make_sense
92
- end
93
-
94
- COMPLETE_UNIT_DEFINITIONS = [
95
- %w{units},
96
- %w{from_units to_units},
97
- %w{units_field_name},
98
- %w{units_field_name to_units},
99
- %w{units_field_number},
100
- %w{units_field_number to_units}
101
- ]
102
-
103
- def make_sure_unit_definitions_make_sense
104
- import_steps.each do |step|
105
- step.attributes.each do |_, attribute|
106
- if attribute.options.any? { |k, _| k.to_s =~ /unit/ } and COMPLETE_UNIT_DEFINITIONS.none? { |complete_definition| complete_definition.all? { |required_option| attribute.options[required_option].present? } }
107
- raise %{
108
-
109
- ================================
110
-
111
- You don't have a valid unit definition for #{resource.name}##{attribute.name}.
112
-
113
- You supplied #{attribute.options.keys.select { |k, _| k.to_s =~ /unit/ }.inspect }.
114
-
115
- You need to supply one of #{COMPLETE_UNIT_DEFINITIONS.map(&:inspect).to_sentence}".
116
-
117
- ================================
118
- }
119
- end
120
- end
121
- end
122
- end
123
- end
124
- end
data/lib/data_miner/import.rb DELETED
@@ -1,93 +0,0 @@
1
- require 'blockenspiel'
2
- require 'errata'
3
- require 'remote_table'
4
- class DataMiner
5
- class Import
6
- include ::Blockenspiel::DSL
7
-
8
- attr_reader :attributes
9
- attr_reader :config
10
- attr_reader :options
11
- attr_reader :description
12
-
13
- def initialize(config, description, options = {})
14
- @config = config
15
- @description = description
16
- @options = options.dup
17
- @options.stringify_keys!
18
- # legacy
19
- if @options.has_key? 'table'
20
- DataMiner.logger.warn "'table' is no longer an allowed option, taking the url from it and ignoring the rest"
21
- table_instance = @options.delete 'table'
22
- @options['url'] = table_instance.url
23
- end
24
- # legacy
25
- if @options.has_key?('errata') and not @options['errata'].is_a?(::Hash)
26
- DataMiner.logger.warn "'errata' must be a hash of Errata options. taking the URL from the Errata instance you provided and ignoring everything else"
27
- errata_instance = @options.delete 'errata'
28
- @options['errata'] = { 'url' => errata_instance.options['url'] }
29
- end
30
- end
31
-
32
- def attributes
33
- @attributes ||= ::ActiveSupport::OrderedHash.new
34
- end
35
-
36
- def resource
37
- config.resource
38
- end
39
-
40
- def inspect
41
- %{#<DataMiner::Import(#{resource}) #{description}>}
42
- end
43
-
44
- def store(attr_name, attr_options = {})
45
- raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
46
- attributes[attr_name] = Attribute.new self, attr_name, attr_options
47
- end
48
-
49
- def key(attr_name, attr_options = {})
50
- raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
51
- @_key = attr_name
52
- store attr_name, attr_options
53
- end
54
-
55
- def primary_key
56
- resource.primary_key
57
- end
58
-
59
- def table
60
- return @table if @table.is_a? ::RemoteTable
61
- # don't mess with the originals
62
- options = @options.dup
63
- options['streaming'] = true
64
- if options['errata']
65
- errata_options = options['errata'].dup
66
- errata_options.stringify_keys!
67
- errata_options['responder'] ||= resource
68
- options['errata'] = errata_options
69
- end
70
- @table = ::RemoteTable.new options
71
- end
72
-
73
- def free
74
- attributes.each { |_, attr| attr.free }
75
- @table.free if @table.is_a?(::RemoteTable)
76
- @table = nil
77
- end
78
-
79
- def run
80
- table.each do |row|
81
- record = resource.send "find_or_initialize_by_#{@_key}", attributes[@_key].value_from_row(row)
82
- attributes.each { |_, attr| attr.set_record_from_row record, row }
83
- begin
84
- record.save!
85
- rescue
86
- DataMiner.logger.warn "[data_miner] Got #{$!.inspect} when trying to save #{row}"
87
- end
88
- end
89
- free
90
- nil
91
- end
92
- end
93
- end
data/lib/data_miner/process.rb DELETED
@@ -1,38 +0,0 @@
1
- class DataMiner
2
- class Process
3
- attr_reader :config
4
- attr_reader :method_id
5
- attr_reader :description
6
- attr_reader :blk
7
-
8
- alias :block_description :description
9
-
10
- def initialize(config, method_id_or_description, &blk)
11
- @config = config
12
- if block_given?
13
- @description = method_id_or_description
14
- @blk = blk
15
- else
16
- @description = method_id_or_description
17
- @method_id = method_id_or_description
18
- end
19
- end
20
-
21
- def resource
22
- config.resource
23
- end
24
-
25
- def inspect
26
- %{#<DataMiner::Process(#{resource}) #{description}>}
27
- end
28
-
29
- def run
30
- if blk
31
- blk.call
32
- else
33
- resource.send method_id
34
- end
35
- nil
36
- end
37
- end
38
- end
data/lib/data_miner/tap.rb DELETED
@@ -1,143 +0,0 @@
1
- require 'uri'
2
- class DataMiner
3
- # Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
4
- #
5
- # This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
6
- class Tap
7
- attr_reader :config
8
- attr_reader :description
9
- attr_reader :source
10
- attr_reader :options
11
-
12
- def initialize(config, description, source, options = {})
13
- @config = config
14
- @options = options.dup
15
- @options.stringify_keys!
16
- @description = description
17
- @source = source
18
- end
19
-
20
- def resource
21
- config.resource
22
- end
23
-
24
- def inspect
25
- %{#<DataMiner::Tap(#{resource}): #{description} (#{source})>}
26
- end
27
-
28
- def run
29
- [ source_table_name, resource.table_name ].each do |possible_obstacle|
30
- if connection.table_exists? possible_obstacle
31
- connection.drop_table possible_obstacle
32
- end
33
- end
34
- taps_pull
35
- if needs_table_rename?
36
- connection.rename_table source_table_name, resource.table_name
37
- end
38
- nil
39
- end
40
-
41
- # sabshere 1/25/11 what if there were multiple connections
42
- # blockenspiel doesn't like to delegate this to #resource
43
- def connection
44
- ::ActiveRecord::Base.connection
45
- end
46
-
47
- def db_config
48
- @db_config ||= connection.instance_variable_get(:@config).stringify_keys.merge(options.except('source_table_name'))
49
- end
50
-
51
- def source_table_name
52
- options['source_table_name'] || resource.table_name
53
- end
54
-
55
- def needs_table_rename?
56
- source_table_name != resource.table_name
57
- end
58
-
59
- def adapter
60
- case connection.adapter_name
61
- when /mysql2/i
62
- 'mysql2'
63
- when /mysql/i
64
- 'mysql'
65
- when /postgres/i
66
- 'postgres'
67
- when /sqlite/i
68
- 'sqlite'
69
- end
70
- end
71
-
72
- # never optional
73
- def database
74
- db_config['database']
75
- end
76
-
77
- DEFAULT_PORTS = {
78
- 'mysql' => 3306,
79
- 'mysql2' => 3306,
80
- 'postgres' => 5432
81
- }
82
-
83
- DEFAULT_USERNAMES = {
84
- 'mysql' => 'root',
85
- 'mysql2' => 'root',
86
- 'postgres' => ''
87
- }
88
-
89
- DEFAULT_PASSWORDS = {}
90
- DEFAULT_PASSWORDS.default = ''
91
-
92
- DEFAULT_HOSTS = {}
93
- DEFAULT_HOSTS.default = 'localhost'
94
-
95
- %w{ username password port host }.each do |x|
96
- module_eval %{
97
- def #{x}
98
- db_config['#{x}'] || DEFAULT_#{x.upcase}S[adapter]
99
- end
100
- }
101
- end
102
-
103
- # "user:pass"
104
- # "user"
105
- # nil
106
- def userinfo
107
- if username.present?
108
- [username, password].select(&:present?).join(':')
109
- end
110
- end
111
-
112
- def db_url
113
- case adapter
114
- when 'sqlite'
115
- "sqlite://#{database}"
116
- else
117
- ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
118
- end
119
- end
120
-
121
- def taps_pull
122
- args = [
123
- 'taps',
124
- 'pull',
125
- db_url,
126
- source,
127
- '--indexes-first',
128
- '--tables',
129
- source_table_name
130
- ]
131
- child = nil
132
-
133
- # https://github.com/carlhuda/bundler/issues/1579
134
- if defined?(::Bundler)
135
- ::Bundler.with_clean_env do
136
- ::Kernel.system args.join(' ')
137
- end
138
- else
139
- ::Kernel.system args.join(' ')
140
- end
141
- end
142
- end
143
- end
data/test/support/aircraft.rb DELETED
@@ -1,102 +0,0 @@
1
- require 'loose_tight_dictionary'
2
-
3
- class Aircraft < ActiveRecord::Base
4
- set_primary_key :icao_code
5
- set_table_name 'aircraft'
6
-
7
- def self.bts_dictionary
8
- @_dictionary ||= LooseTightDictionary.new RemoteTable.new(:url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv', :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }),
9
- :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
10
- :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
11
- :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
12
- :left_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Model'] },
13
- :right_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
14
- end
15
-
16
- class BtsAircraftTypeCodeMatcher
17
- def match(left_record)
18
- right_record = Aircraft.bts_dictionary.left_to_right left_record
19
- right_record['Aircraft Type'] if right_record
20
- end
21
- end
22
-
23
- class BtsNameMatcher
24
- def match(left_record)
25
- right_record = Aircraft.bts_dictionary.left_to_right left_record
26
- right_record['Manufacturer'] + ' ' + right_record['Long Name'] if right_record
27
- end
28
- end
29
-
30
- class Guru
31
- # for errata
32
- def is_attributed_to_boeing?(row)
33
- row['Manufacturer'] =~ /BOEING/i
34
- end
35
-
36
- def is_not_attributed_to_airbus?(row)
37
- row['Manufacturer'] =~ /AIRBUS/i
38
- end
39
-
40
- def is_attributed_to_cessna?(row)
41
- row['Manufacturer'] =~ /CESSNA/i
42
- end
43
-
44
- def is_attributed_to_fokker?(row)
45
- row['Manufacturer'] =~ /FOKKER/i
46
- end
47
-
48
- def is_not_attributed_to_aerospatiale?(row)
49
- not row['Manufacturer'] =~ /AEROSPATIALE/i
50
- end
51
-
52
- def is_not_attributed_to_cessna?(row)
53
- not row['Manufacturer'] =~ /CESSNA/i
54
- end
55
-
56
- def is_not_attributed_to_learjet?(row)
57
- not row['Manufacturer'] =~ /LEAR/i
58
- end
59
-
60
- def is_not_attributed_to_dehavilland?(row)
61
- not row['Manufacturer'] =~ /DE ?HAVILLAND/i
62
- end
63
-
64
- def is_not_attributed_to_mcdonnell_douglas?(row)
65
- not row['Manufacturer'] =~ /MCDONNELL DOUGLAS/i
66
- end
67
-
68
- def is_not_a_dc_plane?(row)
69
- not row['Model'] =~ /DC/i
70
- end
71
-
72
- def is_a_crj_900?(row)
73
- row['Designator'].downcase == 'crj9'
74
- end
75
- end
76
-
77
- data_miner do
78
- # ('A'..'Z').each do |letter|
79
- # Note: for the purposes of testing, only importing "D"
80
- %w{ D }.each do |letter|
81
- import("ICAO codes starting with letter #{letter} used by the FAA",
82
- :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
83
- :encoding => 'windows-1252',
84
- :errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => 'Aircraft::Guru' },
85
- :row_xpath => '//table/tr[2]/td/table/tr',
86
- :column_xpath => 'td') do
87
- key 'icao_code', :field_name => 'Designator'
88
- store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new, :nullify => true
89
- store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new, :nullify => true
90
- store 'manufacturer_name', :field_name => 'Manufacturer', :nullify => true
91
- store 'name', :field_name => 'Model', :nullify => true
92
- end
93
-
94
- import 'Brighter Planet aircraft class codes',
95
- :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
96
- key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
97
- store 'brighter_planet_aircraft_class_code', :nullify => true
98
- end
99
- end
100
- end
101
- end
102
-