airport_scraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +36 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/airport_scraper.rb +169 -0
- data/lib/ca_airports.yml +133 -0
- data/lib/intl_airports.yml +8277 -0
- data/lib/us_airports.yml +2370 -0
- data/script/console +10 -0
- data/test/bad_matches.yml +10 -0
- data/test/ca_airports_tests.yml +102 -0
- data/test/helper.rb +10 -0
- data/test/intl_airports_tests.yml +2 -0
- data/test/test_airport_scraper.rb +143 -0
- data/test/us_airports_tests.yml +71 -0
- metadata +92 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Jacob Harris
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
= airport_scraper
|
2
|
+
|
3
|
+
A gem for extracting Airport mentions from short snippets of text. Just something I threw together as an experiment that's turned into an interesting hobby project.
|
4
|
+
|
5
|
+
== Examples
|
6
|
+
|
7
|
+
scraper = AirportScraper.new
|
8
|
+
pdx1 = scraper.extract_airports("On my flight to Portland. Looking forward to kicking back at home.")
|
9
|
+
pdx2 = scraper.extract_airports("Landing in PDX now. See you all soon.")
|
10
|
+
|
11
|
+
Both pdx1 and pdx2 would be an array with an Airport hash for Portland International Airport
|
12
|
+
|
13
|
+
multi = scraper.extract_airports("On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!")
|
14
|
+
assert_equal ['LIT', 'DAL'], multi.map {|x| x['code']}
|
15
|
+
|
16
|
+
== Possible Future Work
|
17
|
+
|
18
|
+
* Ways to limit scope to major airports only or specific countries
|
19
|
+
* More airport information
|
20
|
+
* Geocoding/WOEIDs for airports
|
21
|
+
* Ordering of airports in the result array to reflect trip order
|
22
|
+
|
23
|
+
== Note on Patches/Pull Requests
|
24
|
+
|
25
|
+
* Fork the project.
|
26
|
+
* Make your feature addition or bug fix.
|
27
|
+
* Add tests for it. This is important so I don't break it in a
|
28
|
+
future version unintentionally.
|
29
|
+
* Commit, do not mess with rakefile, version, or history.
|
30
|
+
(if you want to have your own version, that is fine but
|
31
|
+
bump version in a commit by itself I can ignore when I pull)
|
32
|
+
* Send me a pull request. Bonus points for topic branches.
|
33
|
+
|
34
|
+
== Copyright
|
35
|
+
|
36
|
+
Copyright (c) 2009 Jacob Harris. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging/release tasks come from Jeweler. It is optional: when it is
# not installed we only print a hint, so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "airport_scraper"
    gem.summary = %Q{A gem for extracting airports from mentions in text}
    gem.description = %Q{A gem for extracting airport codes from text}
    gem.email = "jharris@nytimes.com"
    gem.homepage = "http://github.com/harrisj/airport_scraper"
    gem.authors = ["Jacob Harris"]
    gem.add_development_dependency "shoulda", ">= 0"
    gem.add_development_dependency "yard", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage report via RCov when available; otherwise a stub task that
# explains how to install it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only defined when Jeweler loaded above. Depending on
# it unconditionally makes `rake test` abort with "Don't know how to build
# task 'check_dependencies'" on machines without Jeweler.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation via YARD when available; otherwise a stub task that
# explains how to install it.
begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
# Extracts airport mentions from short snippets of text.
#
# Airports are recognised either by a three-letter uppercase code that is a
# key in the loaded data, or by one of the free-text "matchers" listed for an
# airport in the YAML data files.
class AirportScraper
  # Base names of the YAML data files merged, in this order, by #load_airports.
  # On duplicate codes a later file overrides an earlier one.
  AIRPORT_FILES = %w(ca_airports us_airports intl_airports).freeze

  attr_reader :airports, :airport_codes

  # Loads the bundled airport data and builds the text-matching regexes.
  def initialize
    load_airports
    create_regexes
  end

  # Builds one case-insensitive regex that matches any of the given matcher
  # strings at the start of a line, followed by a word boundary.
  #
  # Only literal dots are escaped; any other regex syntax inside a matcher is
  # passed through to the resulting regex unchanged.
  #
  # @param matchers [Array<String>, nil] matcher strings for one airport
  # @return [Regexp, nil] nil when there are no matchers
  def regex_from_matchers(matchers)
    return nil if matchers.nil? || matchers.empty?

    alternatives = matchers.map { |matcher| matcher.gsub(".", "\\.") }.join('|')
    /^(#{alternatives})\b/i
  end

  # Reads the airport YAML files and prepares the lookup structures.
  #
  # Every airport hash gets defaults filled in: 'code' (its key), 'major'
  # (false), 'match_priority' (10 for major airports, otherwise 0), 'name'
  # (falls back to 'city'), 'matchers' (always an Array) and 'regex'.
  # Airports are also indexed by the prefix of each matcher, highest
  # 'match_priority' first, so #extract_airports only has to try a few
  # candidates per text fragment.
  #
  # @param data_dir [String] directory containing the *.yml data files;
  #   defaults to the directory of this source file
  def load_airports(data_dir = File.dirname(__FILE__))
    @airports = {}

    AIRPORT_FILES.each do |file|
      # An empty YAML document loads as false/nil; treat it as "no airports"
      # instead of letting merge! raise.
      @airports.merge!(YAML.load_file(File.join(data_dir, "#{file}.yml")) || {})
    end

    @matcher_prefixes = {}

    @airports.each do |code, info|
      info['code'] = code
      info['major'] ||= false
      info['match_priority'] ||= info['major'] ? 10 : 0
      info['name'] ||= info['city']
      # Array() turns nil into [], keeps arrays, and wraps a scalar matcher.
      # The previous String#to_a call raises NoMethodError on Ruby >= 1.9.
      info['matchers'] = Array(info['matchers'])
      info['regex'] = regex_from_matchers(info['matchers'])

      info['matchers'].map { |matcher| prefix_from_match(matcher) }.uniq.each do |prefix|
        (@matcher_prefixes[prefix] ||= []) << info
      end
    end

    @airport_codes = @airports.keys

    @matcher_prefixes.each_value do |candidates|
      candidates.sort! { |a, b| b['match_priority'] <=> a['match_priority'] }
    end
  end

  # Computes the index key used to bucket matchers and text fragments.
  #
  # @param str [String]
  # @return [String] the first two characters, downcased, when +str+ contains
  #   two consecutive word characters followed by a word boundary; otherwise
  #   the first three characters, downcased
  def prefix_from_match(str)
    length = (str =~ /\w\w\b/) ? 2 : 3
    str[0, length].downcase
  end

  # Builds the regexes used to find flight-related phrases in text.
  #
  # @match_regexes holds either a bare Regexp, or a [Regexp, split_regex]
  # pair when the captured text must additionally be split into several
  # airport fragments (e.g. "LIT to DAL").
  def create_regexes
    @code_match_regex = /\b([A-Z]{3})\b/

    flight_regex = /(flight|flying|plane|jet|turboprop)(\s(back|again|over))?/i

    @trans_regex = /(\sto\s)|(\s?->\s?)|(\s?>\s?)|(\s?✈\s?)/
    @via_regex = /,?\s?(via|by way of|on route to)\s/

    @preposition_regex = /\sfrom\s|\sto\s|#{@via_regex}|#{@trans_regex}|\sin\s|\sat\s|@\s|\sout of\s/i

    airport_regex = /(.+)/
    prep_airport_regex = /(.+)(?=#{@preposition_regex})/

    @match_regexes = [
      [/(#{@code_match_regex}(#{@trans_regex}(#{@code_match_regex}))+\b)/, @trans_regex],
      /((at|@|in) #{airport_regex} airport)/i,
      /((boarding|departing) (to|from|in) #{airport_regex})/i,
      /(touched down in #{airport_regex})\b/i,
      /((to land)|(land(ed|ing|s)) (in|at) #{airport_regex})\b/i,
      [/(#{flight_regex}#{@preposition_regex}(#{prep_airport_regex}#{@preposition_regex})*#{airport_regex}+)/i, @trans_regex],
    ]
  end

  # Looks up a single airport.
  #
  # @param code [String] airport code as used for the keys of the data files
  # @return [Hash, nil] the airport hash, or nil for an unknown code
  def airport(code)
    @airports[code]
  end

  # @return [Array<String>] words that hint at a flight being discussed
  def flight_terms
    %w(touched landed landing land lands plane jet turboprop flying flight boarding departing)
  end

  # Tells whether any of the flight phrase regexes matches the text.
  #
  # @param text [String]
  # @return [Boolean]
  def possible_flight?(text)
    @match_regexes.any? do |regex_pair|
      # Entries are either a Regexp or a [Regexp, split_regex] pair.
      Array(regex_pair).first =~ text
    end
  end

  # Placeholder check that currently accepts every text.
  #
  # @param text [String] ignored
  # @return [true]
  def is_flight(text)
    true
  end

  # Finds the airports mentioned in a snippet of text.
  #
  # Each flight phrase regex is scanned over the text; every captured
  # fragment is resolved either as a leading three-letter code or through
  # the matcher prefix index, where the highest-priority airport whose regex
  # matches wins.
  #
  # @param text [String, nil]
  # @return [Array<Hash>] unique airport hashes in the order they were found;
  #   empty when nothing matches or +text+ is nil
  def extract_airports(text)
    return [] if text.nil?

    found = []
    # Built once per call instead of once per fragment.
    leading_code = /^#{@code_match_regex}/

    @match_regexes.each do |regex_pair|
      regex, split_regex = Array(regex_pair)

      text.scan(regex) do |matches|
        if split_regex
          matches = matches.compact.map { |m| m.split(split_regex) }.flatten.uniq
        end

        matches.compact.each do |fragment|
          next if fragment.length < 2

          if fragment =~ leading_code
            by_code = @airports[Regexp.last_match(1)]
            found << by_code unless by_code.nil?
          else
            candidates = @matcher_prefixes[prefix_from_match(fragment)] || []
            # Candidates are pre-sorted by match_priority; take the first hit.
            by_name = candidates.find do |candidate|
              candidate['regex'] && fragment =~ /#{candidate['regex']}\b/
            end
            found << by_name if by_name
          end
        end
      end
    end

    found.uniq
  end
end
|
data/lib/ca_airports.yml
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
YCD:
|
2
|
+
city: "Nanaimo, BC"
|
3
|
+
matchers:
|
4
|
+
- Nanaimo
|
5
|
+
|
6
|
+
YEG:
|
7
|
+
city: "Edmonton"
|
8
|
+
matchers:
|
9
|
+
- "Edmonton"
|
10
|
+
- "Alberta"
|
11
|
+
|
12
|
+
YFC:
|
13
|
+
city: "Fredericton, NB"
|
14
|
+
matchers:
|
15
|
+
- "Fredericton"
|
16
|
+
|
17
|
+
YGK:
|
18
|
+
name: "Norman Rogers Airport"
|
19
|
+
city: "Kingston, ON"
|
20
|
+
match_priority: 100
|
21
|
+
matchers:
|
22
|
+
- "Kingston, ON"
|
23
|
+
- "Kingston ON"
|
24
|
+
- "Kingston Ontario"
|
25
|
+
- "Kingston, Ontario"
|
26
|
+
|
27
|
+
YHZ:
|
28
|
+
city: "Halifax, NS"
|
29
|
+
matchers:
|
30
|
+
- "Halifax"
|
31
|
+
|
32
|
+
YKF:
|
33
|
+
city: "Kitchener, ON"
|
34
|
+
matchers:
|
35
|
+
- Kitchener
|
36
|
+
|
37
|
+
YMQ:
|
38
|
+
city: "Montreal, QC (All Airports)"
|
39
|
+
matchers:
|
40
|
+
- Montreal
|
41
|
+
|
42
|
+
YOW:
|
43
|
+
city: Ottawa
|
44
|
+
matchers:
|
45
|
+
- Ottawa
|
46
|
+
|
47
|
+
YUL:
|
48
|
+
name: "Montreal Dorval Airport"
|
49
|
+
city: "Montreal, QC"
|
50
|
+
match_priority: 50
|
51
|
+
matchers:
|
52
|
+
- "Dorval"
|
53
|
+
- "Montreal Dorval"
|
54
|
+
|
55
|
+
YQB:
|
56
|
+
city: Quebec City
|
57
|
+
matchers:
|
58
|
+
- Quebec City
|
59
|
+
|
60
|
+
YQM:
|
61
|
+
city: "Moncton, NB"
|
62
|
+
matchers:
|
63
|
+
- "Moncton"
|
64
|
+
|
65
|
+
YQR:
|
66
|
+
city: "Regina, SK"
|
67
|
+
matchers:
|
68
|
+
- "Regina"
|
69
|
+
|
70
|
+
YQT:
|
71
|
+
city: "Thunder Bay, ON"
|
72
|
+
matchers:
|
73
|
+
- Thunder Bay
|
74
|
+
|
75
|
+
YVR:
|
76
|
+
name: Vancouver International Airport
|
77
|
+
city: "Vancouver, BC"
|
78
|
+
matchers:
|
79
|
+
- "Vancouver"
|
80
|
+
|
81
|
+
YWG:
|
82
|
+
city: "Winnipeg"
|
83
|
+
matchers:
|
84
|
+
- Winnipeg
|
85
|
+
|
86
|
+
YXU:
|
87
|
+
city: "London, ON"
|
88
|
+
match_priority: 100
|
89
|
+
matchers:
|
90
|
+
- "London, ON"
|
91
|
+
- "London Ontario"
|
92
|
+
- "London, Ontario"
|
93
|
+
|
94
|
+
YXY:
|
95
|
+
city: "Whitehorse, YT"
|
96
|
+
matchers:
|
97
|
+
- "Whitehorse"
|
98
|
+
|
99
|
+
YYC:
|
100
|
+
city: Calgary
|
101
|
+
country: ca
|
102
|
+
matchers:
|
103
|
+
- Calgary
|
104
|
+
|
105
|
+
YYG:
|
106
|
+
city: "Charlottetown, PEI"
|
107
|
+
country: ca
|
108
|
+
matchers:
|
109
|
+
- "Charlottetown"
|
110
|
+
- "Prince Edward Island"
|
111
|
+
- "P.E.I."
|
112
|
+
|
113
|
+
YYJ:
|
114
|
+
city: "Victoria, BC"
|
115
|
+
matchers:
|
116
|
+
- "Victoria"
|
117
|
+
|
118
|
+
YYT:
|
119
|
+
city: "Saint Johns, NL"
|
120
|
+
matchers:
|
121
|
+
- "Saint Johns"
|
122
|
+
- "St. Johns"
|
123
|
+
- "St.Johns"
|
124
|
+
|
125
|
+
YYZ:
|
126
|
+
city: Toronto
|
127
|
+
matchers:
|
128
|
+
- "Toronto"
|
129
|
+
|
130
|
+
YZF:
|
131
|
+
city: "Yellowknife, NT"
|
132
|
+
matchers:
|
133
|
+
- "Yellowknife"
|