airport_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Jacob Harris
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,36 @@
1
+ = airport_scraper
2
+
3
+ A gem for extracting Airport mentions from short snippets of text. Just something I threw together as an experiment that's turned into an interesting hobby project.
4
+
5
+ == Examples
6
+
7
+ scraper = AirportScraper.new
8
+ pdx1 = scraper.extract_airports("On my flight to Portland. Looking forward to kicking back at home.")
9
+ pdx2 = scraper.extract_airports("Landing in PDX now. See you all soon.")
10
+
11
+ Both pdx1 and pdx2 would be an array with an Airport hash for Portland International Airport
12
+
13
+ multi = scraper.extract_airports("On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!")
14
+ assert_equal ['LIT', 'DAL'], multi.map {|x| x['code']}
15
+
16
+ == Possible Future Work
17
+
18
+ * Ways to limit scope to major airports only or specific countries
19
+ * More airport information
20
+ * Geocoding/WOEIDs for airports
21
+ * Ordering of airports in the result array to reflect trip order
22
+
23
+ == Note on Patches/Pull Requests
24
+
25
+ * Fork the project.
26
+ * Make your feature addition or bug fix.
27
+ * Add tests for it. This is important so I don't break it in a
28
+ future version unintentionally.
29
+ * Commit, do not mess with rakefile, version, or history.
30
+ (if you want to have your own version, that is fine but
31
+ bump version in a commit by itself I can ignore when I pull)
32
+ * Send me a pull request. Bonus points for topic branches.
33
+
34
+ == Copyright
35
+
36
+ Copyright (c) 2009 Jacob Harris. See LICENSE for details.
@@ -0,0 +1,53 @@
1
require 'rubygems'
require 'rake'

# Gem packaging/release tasks. Jeweler is optional: when it cannot be loaded
# we only print a hint so that the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "airport_scraper"
    gem.summary = %Q{A gem for extracting airports from mentions in text}
    gem.description = %Q{A gem for extracting airport codes from text}
    gem.email = "jharris@nytimes.com"
    gem.homepage = "http://github.com/harrisj/airport_scraper"
    gem.authors = ["Jacob Harris"]
    gem.add_development_dependency "shoulda", ">= 0"
    gem.add_development_dependency "yard", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task; falls back to a stub that explains how to install RCov.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# check_dependencies is not defined anywhere in this Rakefile; it is expected
# to be provided by the Jeweler tasks above. Only depend on it when it really
# exists, otherwise `rake test` (and the default task) would abort with an
# unknown-prerequisite error whenever Jeweler failed to load.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation task; falls back to a stub that explains how to install YARD.
begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,169 @@
1
+ require 'yaml'
2
+
3
# Extracts airport mentions from short snippets of text.
#
# Airports are described by YAML files stored next to this source file. Each
# top-level key is an airport code and each value is a hash of attributes
# ('city', 'name', 'matchers', 'match_priority', ...). A mention is recognised
# either as a bare three-capital-letter code or through one of the airport's
# 'matchers' strings appearing at the start of a captured phrase.
class AirportScraper
  # Base names (without ".yml") of the data files read by #load_airports.
  AIRPORT_FILES = %w(ca_airports us_airports intl_airports).freeze

  # airports      - Hash of airport code => attribute Hash.
  # airport_codes - Array of every known airport code.
  attr_reader :airports, :airport_codes

  # Loads the airport data and compiles the phrase-matching regexes.
  def initialize
    load_airports
    create_regexes
  end

  # Builds one case-insensitive regex that matches any of +matchers+ at the
  # start of a line.
  #
  # Only "." is escaped, exactly as before, so any other regex syntax inside a
  # matcher is passed through untouched.
  # NOTE(review): whether the data files rely on that is not visible here.
  #
  # The match must end where a word ends. A plain \b is not enough for that:
  # after a matcher ending in punctuation (e.g. "P.E.I.") \b demands a word
  # character next, so "P.E.I. tomorrow" could never match. The tail therefore
  # also accepts "next character is not a word character". Everything the
  # former \b accepted is still accepted.
  #
  # Returns a Regexp, or nil when +matchers+ is nil or empty.
  def regex_from_matchers(matchers)
    return nil if matchers.nil? || matchers.empty?

    alternatives = matchers.map { |m| m.gsub(".", "\\.") }.join('|')
    /^(#{alternatives})(?:\b|(?!\w))/i
  end

  # Reads every data file into @airports (later files override earlier ones on
  # duplicate codes) and then builds the lookup structures.
  def load_airports
    @airports = {}

    AIRPORT_FILES.each do |file|
      @airports.merge!(YAML.load_file(File.join(File.dirname(__FILE__), "#{file}.yml")))
    end

    index_airports
  end

  # Returns the bucket key used to pre-filter airports for a phrase: its first
  # two characters, lowercased.
  #
  # The key must depend only on the start of the string, because it is
  # computed both for a matcher ("P.E.I.") and for a longer phrase that begins
  # with it ("P.E.I. tomorrow"). The previous implementation switched to three
  # characters when the string contained no two-letter run anywhere, which
  # made those two keys disagree. For any string that does contain such a run
  # the result is unchanged.
  def prefix_from_match(str)
    str[0, 2].downcase
  end

  # Compiles the regexes used by #possible_flight? and #extract_airports.
  #
  # @match_regexes holds either a bare Regexp or a [Regexp, split_regex] pair;
  # for a pair, every captured string is additionally split on split_regex
  # before airports are looked up.
  def create_regexes
    @code_match_regex = /\b([A-Z]{3})\b/
    # Same pattern, for phrases that begin with a code.
    @leading_code_regex = /^#{@code_match_regex}/

    flight_regex = /(flight|flying|plane|jet|turboprop)(\s(back|again|over))?/i

    @trans_regex = /(\sto\s)|(\s?->\s?)|(\s?>\s?)|(\s?✈\s?)/
    @via_regex = /,?\s?(via|by way of|on route to)\s/

    @preposition_regex = /\sfrom\s|\sto\s|#{@via_regex}|#{@trans_regex}|\sin\s|\sat\s|@\s|\sout of\s/i

    airport_regex = /(.+)/
    prep_airport_regex = /(.+)(?=#{@preposition_regex})/

    @match_regexes = [
      [/(#{@code_match_regex}(#{@trans_regex}(#{@code_match_regex}))+\b)/, @trans_regex],
      /((at|@|in) #{airport_regex} airport)/i,
      /((boarding|departing) (to|from|in) #{airport_regex})/i,
      /(touched down in #{airport_regex})\b/i,
      /((to land)|(land(ed|ing|s)) (in|at) #{airport_regex})\b/i,
      [/(#{flight_regex}#{@preposition_regex}(#{prep_airport_regex}#{@preposition_regex})*#{airport_regex}+)/i, @trans_regex],
    ]
  end

  # Returns the attribute Hash for +code+, or nil when the code is unknown.
  def airport(code)
    @airports[code]
  end

  # Words that hint at a flight being discussed.
  def flight_terms
    %w(touched landed landing land lands plane jet turboprop flying flight boarding departing)
  end

  # True when at least one of the phrase regexes matches +text+.
  def possible_flight?(text)
    @match_regexes.any? do |regex_pair|
      regex = regex_pair.is_a?(Array) ? regex_pair[0] : regex_pair
      regex =~ text
    end
  end

  # Placeholder: currently accepts every text.
  def is_flight(text)
    true
  end

  # Returns the distinct airports (attribute Hashes) mentioned in +text+, in
  # the order in which they were first found.
  def extract_airports(text)
    found = []

    @match_regexes.each do |regex_pair|
      # A bare Regexp becomes [regex], leaving split_regex nil.
      regex, split_regex = Array(regex_pair)

      text.scan(regex) do |captures|
        candidates = captures.compact
        if split_regex
          candidates = candidates.map { |c| c.split(split_regex) }.flatten.uniq
        end

        candidates.each do |candidate|
          next if candidate.length < 2

          airport = airport_for(candidate)
          found << airport unless airport.nil?
        end
      end
    end

    found.uniq
  end

  private

  # Fills in per-airport defaults, compiles each airport's matcher regex and
  # builds @matcher_prefixes (prefix => airports, highest match_priority
  # first) plus @airport_codes from the already populated @airports.
  def index_airports
    @matcher_prefixes = {}

    @airports.each do |code, airport|
      airport['code'] = code
      airport['major'] ||= false
      airport['match_priority'] ||= airport['major'] ? 10 : 0
      airport['name'] ||= airport['city']
      # Array() handles nil, a single scalar matcher and a list alike;
      # String#to_a, used previously, does not exist on Ruby >= 1.9.
      airport['matchers'] = Array(airport['matchers'])
      airport['regex'] = regex_from_matchers(airport['matchers'])

      airport['matchers'].map { |m| prefix_from_match(m) }.uniq.each do |prefix|
        (@matcher_prefixes[prefix] ||= []) << airport
      end
    end

    @airport_codes = @airports.keys

    @matcher_prefixes.each_value do |bucket|
      bucket.sort! { |a, b| b['match_priority'] <=> a['match_priority'] }
    end
  end

  # Resolves one captured phrase to an airport Hash or nil.
  #
  # A phrase that starts with a three-capital-letter word is treated as a code
  # lookup only (no fallback to name matching when the code is unknown);
  # anything else is tested against the matcher regexes of the airports that
  # share the phrase's prefix, in priority order.
  def airport_for(candidate)
    if candidate =~ @leading_code_regex
      @airports[Regexp.last_match(1)]
    else
      bucket = @matcher_prefixes[prefix_from_match(candidate)] || []
      bucket.find { |a| a['regex'] && candidate =~ a['regex'] }
    end
  end
end
@@ -0,0 +1,133 @@
1
+ YCD:
2
+ city: "Nanaimo, BC"
3
+ matchers:
4
+ - Nanaimo
5
+
6
+ YEG:
7
+ city: "Edmonton"
8
+ matchers:
9
+ - "Edmonton"
10
+ - "Alberta"
11
+
12
+ YFC:
13
+ city: "Fredericton, NB"
14
+ matchers:
15
+ - "Fredericton"
16
+
17
+ YGK:
18
+ name: "Norman Rogers Airport"
19
+ city: "Kingston, ON"
20
+ match_priority: 100
21
+ matchers:
22
+ - "Kingston, ON"
23
+ - "Kingston ON"
24
+ - "Kingston Ontario"
25
+ - "Kingston, Ontario"
26
+
27
+ YHZ:
28
+ city: "Halifax, NS"
29
+ matchers:
30
+ - "Halifax"
31
+
32
+ YKF:
33
+ city: "Kitchener, ON"
34
+ matchers:
35
+ - Kitchener
36
+
37
+ YMQ:
38
+ city: "Montreal, QC (All Airports)"
39
+ matchers:
40
+ - Montreal
41
+
42
+ YOW:
43
+ city: Ottawa
44
+ matchers:
45
+ - Ottawa
46
+
47
+ YUL:
48
+ name: "Montreal Dorval Airport"
49
+ city: "Montreal, QC"
50
+ match_priority: 50
51
+ matchers:
52
+ - "Dorval"
53
+ - "Montreal Dorval"
54
+
55
+ YQB:
56
+ city: Quebec City
57
+ matchers:
58
+ - Quebec City
59
+
60
+ YQM:
61
+ city: "Moncton, NB"
62
+ matchers:
63
+ - "Moncton"
64
+
65
+ YQR:
66
+ city: "Regina, SK"
67
+ matchers:
68
+ - "Regina"
69
+
70
+ YQT:
71
+ city: "Thunder Bay, ON"
72
+ matchers:
73
+ - Thunder Bay
74
+
75
+ YVR:
76
+ name: Vancouver International Airport
77
+ city: "Vancouver, BC"
78
+ matchers:
79
+ - "Vancouver"
80
+
81
+ YWG:
82
+ city: "Winnipeg"
83
+ matchers:
84
+ - Winnipeg
85
+
86
+ YXU:
87
+ city: "London, ON"
88
+ match_priority: 100
89
+ matchers:
90
+ - "London, ON"
91
+ - "London Ontario"
92
+ - "London, Ontario"
93
+
94
+ YXY:
95
+ city: "Whitehorse, YT"
96
+ matchers:
97
+ - "Whitehorse"
98
+
99
+ YYC:
100
+ city: Calgary
101
+ country: ca
102
+ matchers:
103
+ - Calgary
104
+
105
+ YYG:
106
+ city: "Charlottetown, PEI"
107
+ country: ca
108
+ matchers:
109
+ - "Charlottetown"
110
+ - "Prince Edward Island"
111
+ - "P.E.I."
112
+
113
+ YYJ:
114
+ city: "Victoria, BC"
115
+ matchers:
116
+ - "Victoria"
117
+
118
+ YYT:
119
+ city: "Saint Johns, NL"
120
+ matchers:
121
+ - "Saint Johns"
122
+ - "St. Johns"
123
+ - "St.Johns"
124
+
125
+ YYZ:
126
+ city: Toronto
127
+ matchers:
128
+ - "Toronto"
129
+
130
+ YZF:
131
+ city: "Yellowknife, NT"
132
+ matchers:
133
+ - "Yellowknife"