airport_scraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +36 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/airport_scraper.rb +169 -0
- data/lib/ca_airports.yml +133 -0
- data/lib/intl_airports.yml +8277 -0
- data/lib/us_airports.yml +2370 -0
- data/script/console +10 -0
- data/test/bad_matches.yml +10 -0
- data/test/ca_airports_tests.yml +102 -0
- data/test/helper.rb +10 -0
- data/test/intl_airports_tests.yml +2 -0
- data/test/test_airport_scraper.rb +143 -0
- data/test/us_airports_tests.yml +71 -0
- metadata +92 -0
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# File: script/console
|
3
|
+
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb' # pick the irb.bat launcher on mswin/mingw platforms
|
4
|
+
|
5
|
+
libs = " -r irb/completion"
|
6
|
+
# Perhaps use a console_lib to store any extra methods I may want available in the console
|
7
|
+
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
|
8
|
+
libs << " -r #{File.dirname(__FILE__) + '/../lib/airport_scraper.rb'}"
|
9
|
+
puts "Loading airport_scraper gem"
|
10
|
+
exec "#{irb} #{libs} --simple-prompt"
|
@@ -0,0 +1,10 @@
|
|
1
|
+
- "Taliban factions compete for credit in CIA bombing deaths - It was one of the worst blows ever to America's int.. http://bit.ly/8uo7l4"
|
2
|
+
- "@therealjuicyj #wtf does 'goin' ham' mean?! i hear you use it but we don't say that in toronto at all.."
|
3
|
+
- "Angelina Jolie and Brad Pitt take their kids to the Mary Poppins show on Broadway in NYC – January 3 â. http://bit.ly/6vcSl1"
|
4
|
+
- "back in san diego, ready to boogaaayy. :]"
|
5
|
+
- "In SLC Sky Club watching Jets game. http://myloc.me/2AFzh"
|
6
|
+
- "It's been real! See you in ATL shawty! RT @CEOTUFFLUV: it was fun! Last night in Buffalo!"
|
7
|
+
- "'What is Justin Beiber's birthday? Justin Bieber (real name Alex Lawrence) was born March 14th, 1994 in London.' -- LMAO! LOSERS!"
|
8
|
+
- "Going to climb into bed with a book. Work in Philly resumes tomorrow."
|
9
|
+
- "RT @ourpdx: Do you want a chance to win tickets for Xanadu's stage perf in PDX? Read this: http://bit.ly/8LnD70"
|
10
|
+
|
@@ -0,0 +1,102 @@
|
|
1
|
+
YCD:
|
2
|
+
- "On a flight to Nanaimo"
|
3
|
+
|
4
|
+
YEG:
|
5
|
+
- "Flying from NYC to YEG"
|
6
|
+
- "Landing in Edmonton now. Oh Canada!"
|
7
|
+
- "On the plane to Alberta"
|
8
|
+
|
9
|
+
YFC:
|
10
|
+
- "Touched down in Fredericton. Now to drive home."
|
11
|
+
|
12
|
+
YGK:
|
13
|
+
- "On the plane to Kingston Ontario"
|
14
|
+
- "Landing in Kingston, ON"
|
15
|
+
- "Flying from New York City to Kingston, Ontario"
|
16
|
+
|
17
|
+
YHZ:
|
18
|
+
- "Landed in Halifax. Now to find the Trailer Park Boys"
|
19
|
+
|
20
|
+
YKF:
|
21
|
+
- "On a jet to Kitchener."
|
22
|
+
|
23
|
+
YMQ:
|
24
|
+
- "Landing in Montreal next stop: Poutine"
|
25
|
+
|
26
|
+
YOW:
|
27
|
+
- "Just touched down in Ottawa"
|
28
|
+
|
29
|
+
YUL:
|
30
|
+
- "Landing at Montreal Dorval airport"
|
31
|
+
- "Landed at Dorval and ready to leave the plane"
|
32
|
+
|
33
|
+
YQB:
|
34
|
+
- "On a flight to Quebec City"
|
35
|
+
|
36
|
+
YQM:
|
37
|
+
- "Landing in Moncton, you probably don't know where this is"
|
38
|
+
|
39
|
+
YQR:
|
40
|
+
- "On a turboprop to Regina"
|
41
|
+
|
42
|
+
YQT:
|
43
|
+
- "Landed in Thunder Bay"
|
44
|
+
|
45
|
+
YVR:
|
46
|
+
- "Flying to Vancouver tomorrow for the Winter Olympics."
|
47
|
+
|
48
|
+
YWG:
|
49
|
+
- "Landing in Winnipeg."
|
50
|
+
|
51
|
+
YYZ:
|
52
|
+
- "Good times flying from YYZ to SFO today. Why is our security so reactive? Fear? If terror was Umar Farouk's goal, then Mission Accomplished."
|
53
|
+
- "flying back to Toronto this morning. Sad"
|
54
|
+
|
55
|
+
# YXU:
|
56
|
+
# city: "London, ON"
|
57
|
+
# match_priority: 100
|
58
|
+
# matchers:
|
59
|
+
# - "London, ON"
|
60
|
+
# - "London Ontario"
|
61
|
+
# - "London, Ontario"
|
62
|
+
|
63
|
+
# YXY:
|
64
|
+
# city: "Whitehorse, YT"
|
65
|
+
# matchers:
|
66
|
+
# - "Whitehorse"
|
67
|
+
#
|
68
|
+
# YYC:
|
69
|
+
# city: Calgary
|
70
|
+
# country: ca
|
71
|
+
# matchers:
|
72
|
+
# - Calgary
|
73
|
+
#
|
74
|
+
# YYG:
|
75
|
+
# city: "Charlottetown, PEI"
|
76
|
+
# country: ca
|
77
|
+
# matchers:
|
78
|
+
# - "Charlottetown"
|
79
|
+
# - "Prince Edward Island"
|
80
|
+
# - "P.E.I."
|
81
|
+
#
|
82
|
+
# YYJ:
|
83
|
+
# city: "Victoria, BC"
|
84
|
+
# matchers:
|
85
|
+
# - "Victoria"
|
86
|
+
#
|
87
|
+
# YYT:
|
88
|
+
# city: "Saint Johns, NL"
|
89
|
+
# matchers:
|
90
|
+
# - "Saint Johns"
|
91
|
+
# - "St. Johns"
|
92
|
+
# - "St.Johns"
|
93
|
+
#
|
94
|
+
# YYZ:
|
95
|
+
# city: Toronto
|
96
|
+
# matchers:
|
97
|
+
# - "Toronto"
|
98
|
+
#
|
99
|
+
# YZF:
|
100
|
+
# city: "Yellowknife, NT"
|
101
|
+
# matchers:
|
102
|
+
# - "Yellowknife"
|
data/test/test_airport_scraper.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
CA_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "ca_airports_tests.yml"))
|
5
|
+
US_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "us_airports_tests.yml"))
|
6
|
+
INTL_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "intl_airports_tests.yml"))
|
7
|
+
BAD_MATCHES = YAML.load_file(File.join(File.dirname(__FILE__), "bad_matches.yml"))
|
8
|
+
|
9
|
+
class TestAirportScraper < Test::Unit::TestCase
|
10
|
+
context "new" do
|
11
|
+
setup do
|
12
|
+
@scrape = AirportScraper.new
|
13
|
+
end
|
14
|
+
|
15
|
+
should "load the airports.yml file into @airports" do
|
16
|
+
airports = @scrape.airports
|
17
|
+
assert_not_nil airports
|
18
|
+
assert_not_nil airports['JFK'], "Didn't find JFK in airports"
|
19
|
+
end
|
20
|
+
|
21
|
+
should_eventually "create an @code_match_regex to match 3-letter codes" do
|
22
|
+
code_regex = @scrape.instance_variable_get("@code_match_regex")
|
23
|
+
assert_not_nil(code_regex)
|
24
|
+
assert_match(code_regex, 'JFK')
|
25
|
+
assert_no_match(code_regex, 'JFKX')
|
26
|
+
assert_no_match(code_regex, 'jfk')
|
27
|
+
end
|
28
|
+
|
29
|
+
should_eventually "create an @airport_regex" do
|
30
|
+
name_regex = @scrape.instance_variable_get("@airport_regex")
|
31
|
+
assert_not_nil(name_regex)
|
32
|
+
assert_match(name_regex, "Heathrow")
|
33
|
+
assert_match(name_regex, "heathrow")
|
34
|
+
assert_no_match(name_regex, "HeathrowX")
|
35
|
+
end
|
36
|
+
|
37
|
+
should "create an @matcher_prefixes array" do
|
38
|
+
by_priority = @scrape.instance_variable_get("@matcher_prefixes")
|
39
|
+
assert_not_nil(by_priority)
|
40
|
+
end
|
41
|
+
|
42
|
+
should "order @matcher_prefixes values in descending match_priority order" do
|
43
|
+
# Check that PWM comes before PDX
|
44
|
+
pref = @scrape.instance_variable_get("@matcher_prefixes")
|
45
|
+
by_priority = pref[@scrape.prefix_from_match("Portland")]
|
46
|
+
|
47
|
+
assert_not_nil by_priority
|
48
|
+
pdx = by_priority.detect {|x| x['code'] == 'PDX'}
|
49
|
+
pwm = by_priority.detect {|x| x['code'] == 'PWM'}
|
50
|
+
|
51
|
+
pdx_idx = by_priority.index(pdx)
|
52
|
+
pwm_idx = by_priority.index(pwm)
|
53
|
+
assert_not_nil pdx_idx
|
54
|
+
assert_not_nil pwm_idx
|
55
|
+
|
56
|
+
assert(pwm_idx < pdx_idx)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context "possible_flight?" do
|
61
|
+
setup do
|
62
|
+
@scrape = AirportScraper.new
|
63
|
+
end
|
64
|
+
|
65
|
+
["on a flight to Rome", "flying to SFO", "just touched down in Vegas", "EWR to NYC", "EWR -> NYC"].each do |phrase|
|
66
|
+
should "return true for the phrase '#{phrase}'" do
|
67
|
+
assert @scrape.possible_flight?(phrase)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
context "extract_airports" do
|
73
|
+
setup do
|
74
|
+
@scrape = AirportScraper.new
|
75
|
+
end
|
76
|
+
|
77
|
+
context "when there are no airports in the text" do
|
78
|
+
should "return an empty_array" do
|
79
|
+
assert_equal [], @scrape.extract_airports("Twas brillig and the slithy toves")
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
context "Airport code tests" do
|
84
|
+
setup do
|
85
|
+
@scrape = AirportScraper.new
|
86
|
+
end
|
87
|
+
|
88
|
+
should_eventually "be able to match the airport codes" do
|
89
|
+
@scrape.airports.each do |airport|
|
90
|
+
assert_contains @scrape.extract_airports("Just landed in #{airport['code']}."), airport
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
context "Freeform name test" do
|
96
|
+
[US_TESTS, CA_TESTS, INTL_TESTS].each do |tests|
|
97
|
+
tests.keys.each do |code|
|
98
|
+
tests[code].each do |str|
|
99
|
+
should "return the airport #{code} for phrase '#{str}'" do
|
100
|
+
airport = @scrape.airport(code)
|
101
|
+
results = @scrape.extract_airports(str)
|
102
|
+
assert_contains results, airport, "Expected #{code}, returned #{results.map {|x| x['code']}.inspect }"
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
context "Matchers" do
|
110
|
+
setup do
|
111
|
+
@scrape = AirportScraper.new # the tests in this context read @scrape
|
112
|
+
end
|
113
|
+
|
114
|
+
should "not have duplicate matchers for two airports" do
|
115
|
+
matchers = {}
|
116
|
+
airports = @scrape.airports
|
117
|
+
|
118
|
+
airports.values.each do |airport|
|
119
|
+
airport['matchers'].each do |matcher|
|
120
|
+
if matchers[matcher].nil?
|
121
|
+
matchers[matcher] = airport
|
122
|
+
else
|
123
|
+
# if matchers[matcher]['code']['match_priority'] == airport['code']['match_priority']
|
124
|
+
flunk "Matcher '#{matcher}' for more than one airport (#{matchers[matcher]['code']}, #{airport['code']}) at same priority"
|
125
|
+
# end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
should_eventually "not have shorter matchers for a name with a match_priority greater than a longer variant"
|
132
|
+
end
|
133
|
+
|
134
|
+
context "Bad matches" do
|
135
|
+
BAD_MATCHES.each do |str|
|
136
|
+
should "not return any airports for phrase '#{str}'" do
|
137
|
+
results = @scrape.extract_airports(str)
|
138
|
+
assert_equal [], results, "Should not have matched anything, returned #{results.map {|x| x['code']}.inspect }"
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
ABQ:
|
2
|
+
- "W'sup Dallas? 2 hr flight to albuquerque. There for two days and then to colorado to see the brothers for a week :)"
|
3
|
+
- "Jus landed in new mexico...my 1st time here"
|
4
|
+
- "just landed in ABQ, new mexico! hello coooold!"
|
5
|
+
|
6
|
+
BIL:
|
7
|
+
- "Making a quick run to SeaTac to drop off T@. She's flying to Billings but needs be in Bismark"
|
8
|
+
|
9
|
+
CLT:
|
10
|
+
- "Landed in Charlotte..."
|
11
|
+
|
12
|
+
CMH:
|
13
|
+
- "working a terrible shift, but flying to Columbus in the morning!"
|
14
|
+
|
15
|
+
DAL:
|
16
|
+
- "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
|
17
|
+
|
18
|
+
DEN:
|
19
|
+
- "Landed in Denver, now off to the terminal to await the arrival of my mother. I can't wait to consume mass amounts of turkey on Thursday."
|
20
|
+
|
21
|
+
DFW:
|
22
|
+
- "Just landed in Dallas"
|
23
|
+
|
24
|
+
EWR:
|
25
|
+
- "EWR Thanksgiving here we come. MIA to EWR to BDL"
|
26
|
+
- "just landed in NJ... its a gud look so far!!"
|
27
|
+
|
28
|
+
FLL:
|
29
|
+
- "Landed in Ft. Lauderdale. Driving to Boca."
|
30
|
+
|
31
|
+
JFK:
|
32
|
+
- "This little brat just spilled water on me during my 40 minute delayed flight to jfk. FML"
|
33
|
+
|
34
|
+
LAS:
|
35
|
+
- "Touched down in Vegas. Vegas baby!"
|
36
|
+
- "Landed in Vegas, watch out."
|
37
|
+
|
38
|
+
LAX:
|
39
|
+
- "Just landed in LA sans DJ equip, but its good to be back."
|
40
|
+
|
41
|
+
LGA:
|
42
|
+
- "kind of weird...Karlie Kloss was on my flight to LaGuardia today. haha I was tempted to get a picture with her, but she looked exhausted!"
|
43
|
+
|
44
|
+
LIT:
|
45
|
+
- "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
|
46
|
+
|
47
|
+
OAK:
|
48
|
+
- "flight to Oakland departs at 10:25. And I only just got on a shuttle from the parking lots. Here's hoping I don't miss my flight!"
|
49
|
+
|
50
|
+
PDX:
|
51
|
+
- "On my flight to Portland. Looking forward to kicking back at home."
|
52
|
+
- "Landing in PDX now. See you all soon."
|
53
|
+
|
54
|
+
PWM:
|
55
|
+
- "Landed in Portland ME"
|
56
|
+
|
57
|
+
RDU:
|
58
|
+
- "We just landed in Raleigh. Now its time to pack, go to church, and leave tonight to go to Mom's for Thanksgiving!"
|
59
|
+
|
60
|
+
SDF:
|
61
|
+
- "After leaving at 4:30am...just landed in Louisville. And yes, we arrived without a turkey smashing our windshield this year."
|
62
|
+
|
63
|
+
SFO:
|
64
|
+
- "@ San Francisco Airport waiting for my flight to Delhi via Beijing. Delayed by 3 hrs but knew that this morning thanks to Twitter."
|
65
|
+
|
66
|
+
SEA:
|
67
|
+
- "On the same Virgin America flight to Seattle with Top Chef's Marcel whose hair was perfectly peaked even at 9 am"
|
68
|
+
- "Alaska/Horizon FTW - on an earlier flight to SeaTac!"
|
69
|
+
|
70
|
+
STL:
|
71
|
+
- "Just landed in St.Louis, its raining outside smh I'm still going 2 enjoy my break"
|
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: airport_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jacob Harris
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-01 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: shoulda
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
description: A gem for extracting airport codes from text
|
36
|
+
email: jharris@nytimes.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.rdoc
|
44
|
+
files:
|
45
|
+
- .document
|
46
|
+
- .gitignore
|
47
|
+
- LICENSE
|
48
|
+
- README.rdoc
|
49
|
+
- Rakefile
|
50
|
+
- VERSION
|
51
|
+
- lib/airport_scraper.rb
|
52
|
+
- lib/ca_airports.yml
|
53
|
+
- lib/intl_airports.yml
|
54
|
+
- lib/us_airports.yml
|
55
|
+
- script/console
|
56
|
+
- test/bad_matches.yml
|
57
|
+
- test/ca_airports_tests.yml
|
58
|
+
- test/helper.rb
|
59
|
+
- test/intl_airports_tests.yml
|
60
|
+
- test/test_airport_scraper.rb
|
61
|
+
- test/us_airports_tests.yml
|
62
|
+
has_rdoc: true
|
63
|
+
homepage: http://github.com/harrisj/airport_scraper
|
64
|
+
licenses: []
|
65
|
+
|
66
|
+
post_install_message:
|
67
|
+
rdoc_options:
|
68
|
+
- --charset=UTF-8
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: "0"
|
76
|
+
version:
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: "0"
|
82
|
+
version:
|
83
|
+
requirements: []
|
84
|
+
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 1.3.5
|
87
|
+
signing_key:
|
88
|
+
specification_version: 3
|
89
|
+
summary: A gem for extracting airports from mentions in text
|
90
|
+
test_files:
|
91
|
+
- test/helper.rb
|
92
|
+
- test/test_airport_scraper.rb
|