airport_scraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +36 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/airport_scraper.rb +169 -0
- data/lib/ca_airports.yml +133 -0
- data/lib/intl_airports.yml +8277 -0
- data/lib/us_airports.yml +2370 -0
- data/script/console +10 -0
- data/test/bad_matches.yml +10 -0
- data/test/ca_airports_tests.yml +102 -0
- data/test/helper.rb +10 -0
- data/test/intl_airports_tests.yml +2 -0
- data/test/test_airport_scraper.rb +143 -0
- data/test/us_airports_tests.yml +71 -0
- metadata +92 -0
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# File: script/console
|
3
|
+
# Windows Ruby builds (mswin/mingw) launch irb through a batch file; everything else uses plain `irb`.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'
|
4
|
+
|
5
|
+
libs = " -r irb/completion"
|
6
|
+
# Perhaps use a console_lib to store any extra methods I may want available in the console
|
7
|
+
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
|
8
|
+
libs << " -r #{File.dirname(__FILE__) + '/../lib/airport_scraper.rb'}"
|
9
|
+
puts "Loading airport_scraper gem"
|
10
|
+
exec "#{irb} #{libs} --simple-prompt"
|
@@ -0,0 +1,10 @@
|
|
1
|
+
- "Taliban factions compete for credit in CIA bombing deaths - It was one of the worst blows ever to America's int.. http://bit.ly/8uo7l4"
|
2
|
+
- "@therealjuicyj #wtf does 'goin' ham' mean?! i hear you use it but we don't say that in toronto at all.."
|
3
|
+
- "Angelina Jolie and Brad Pitt take their kids to the Mary Poppins show on Broadway in NYC – January 3 â. http://bit.ly/6vcSl1"
|
4
|
+
- "back in san diego, ready to boogaaayy. :]"
|
5
|
+
- "In SLC Sky Club watching Jets game. http://myloc.me/2AFzh"
|
6
|
+
- "It's been real! See you in ATL shawty! RT @CEOTUFFLUV: it was fun! Last night in Buffalo!"
|
7
|
+
- "'What is Justin Beiber's birthday? Justin Bieber (real name Alex Lawrence) was born March 14th, 1994 in London.' -- LMAO! LOSERS!"
|
8
|
+
- "Going to climb into bed with a book. Work in Philly resumes tomorrow."
|
9
|
+
- "RT @ourpdx: Do you want a chance to win tickets for Xanadu's stage perf in PDX? Read this: http://bit.ly/8LnD70"
|
10
|
+
|
@@ -0,0 +1,102 @@
|
|
1
|
+
YCD:
|
2
|
+
- "On a flight to Nanaimo"
|
3
|
+
|
4
|
+
YEG:
|
5
|
+
- "Flying from NYC to YEG"
|
6
|
+
- "Landing in Edmonton now. Oh Canada!"
|
7
|
+
- "On the plane to Alberta"
|
8
|
+
|
9
|
+
YFC:
|
10
|
+
- "Touched down in Fredericton. Now to drive home."
|
11
|
+
|
12
|
+
YGK:
|
13
|
+
- "On the plane to Kingston Ontario"
|
14
|
+
- "Landing in Kingston, ON"
|
15
|
+
- "Flying from New York City to Kingston, Ontario"
|
16
|
+
|
17
|
+
YHZ:
|
18
|
+
- "Landed in Halifax. Now to find the Trailer Park Boys"
|
19
|
+
|
20
|
+
YKF:
|
21
|
+
- "On a jet to Kitchener."
|
22
|
+
|
23
|
+
YMQ:
|
24
|
+
- "Landing in Montreal next stop: Poutine"
|
25
|
+
|
26
|
+
YOW:
|
27
|
+
- "Just touched down in Ottawa"
|
28
|
+
|
29
|
+
YUL:
|
30
|
+
- "Landing at Montreal Dorval airport"
|
31
|
+
- "Landed at Dorval and ready to leave the plane"
|
32
|
+
|
33
|
+
YQB:
|
34
|
+
- "On a flight to Quebec City"
|
35
|
+
|
36
|
+
YQM:
|
37
|
+
- "Landing in Moncton, you probably don't know where this is"
|
38
|
+
|
39
|
+
YQR:
|
40
|
+
- "On a turboprop to Regina"
|
41
|
+
|
42
|
+
YQT:
|
43
|
+
- "Landed in Thunder Bay"
|
44
|
+
|
45
|
+
YVR:
|
46
|
+
- "Flying to Vancouver tomorrow for the Winter Olympics."
|
47
|
+
|
48
|
+
YWG:
|
49
|
+
- "Landing in Winnipeg."
|
50
|
+
|
51
|
+
YYZ:
|
52
|
+
- "Good times flying from YYZ to SFO today. Why is our security so reactive? Fear? If terror was Umar Farouk's goal, then Mission Accomplished."
|
53
|
+
- "flying back to Toronto this morning. Sad"
|
54
|
+
|
55
|
+
# YXU:
|
56
|
+
# city: "London, ON"
|
57
|
+
# match_priority: 100
|
58
|
+
# matchers:
|
59
|
+
# - "London, ON"
|
60
|
+
# - "London Ontario"
|
61
|
+
# - "London, Ontario"
|
62
|
+
|
63
|
+
# YXY:
|
64
|
+
# city: "Whitehorse, YT"
|
65
|
+
# matchers:
|
66
|
+
# - "Whitehorse"
|
67
|
+
#
|
68
|
+
# YYC:
|
69
|
+
# city: Calgary
|
70
|
+
# country: ca
|
71
|
+
# matchers:
|
72
|
+
# - Calgary
|
73
|
+
#
|
74
|
+
# YYG:
|
75
|
+
# city: "Charlottetown, PEI"
|
76
|
+
# country: ca
|
77
|
+
# matchers:
|
78
|
+
# - "Charlottetown"
|
79
|
+
# - "Prince Edward Island"
|
80
|
+
# - "P.E.I."
|
81
|
+
#
|
82
|
+
# YYJ:
|
83
|
+
# city: "Victoria, BC"
|
84
|
+
# matchers:
|
85
|
+
# - "Victoria"
|
86
|
+
#
|
87
|
+
# YYT:
|
88
|
+
# city: "Saint Johns, NL"
|
89
|
+
# matchers:
|
90
|
+
# - "Saint Johns"
|
91
|
+
# - "St. Johns"
|
92
|
+
# - "St.Johns"
|
93
|
+
#
|
94
|
+
# YYZ:
|
95
|
+
# city: Toronto
|
96
|
+
# matchers:
|
97
|
+
# - "Toronto"
|
98
|
+
#
|
99
|
+
# YZF:
|
100
|
+
# city: "Yellowknife, NT"
|
101
|
+
# matchers:
|
102
|
+
# - "Yellowknife"
|
data/test/helper.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
CA_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "ca_airports_tests.yml"))
|
5
|
+
US_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "us_airports_tests.yml"))
|
6
|
+
INTL_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "intl_airports_tests.yml"))
|
7
|
+
BAD_MATCHES = YAML.load_file(File.join(File.dirname(__FILE__), "bad_matches.yml"))
|
8
|
+
|
9
|
+
class TestAirportScraper < Test::Unit::TestCase
|
10
|
+
context "new" do
|
11
|
+
setup do
|
12
|
+
@scrape = AirportScraper.new
|
13
|
+
end
|
14
|
+
|
15
|
+
should "load the airports.yml file into @airports" do
|
16
|
+
airports = @scrape.airports
|
17
|
+
assert_not_nil airports
|
18
|
+
assert_not_nil airports['JFK'], "Didn't find JFK in airports"
|
19
|
+
end
|
20
|
+
|
21
|
+
should_eventually "create an @code_match_regex to match 3-letter codes" do
|
22
|
+
code_regex = @scrape.instance_variable_get("@code_match_regex")
|
23
|
+
assert_not_nil(code_regex)
|
24
|
+
assert_match(code_regex, 'JFK')
|
25
|
+
assert_no_match(code_regex, 'JFKX')
|
26
|
+
assert_no_match(code_regex, 'jfk')
|
27
|
+
end
|
28
|
+
|
29
|
+
should_eventually "create an @airport_regex" do
|
30
|
+
name_regex = @scrape.instance_variable_get("@airport_regex")
|
31
|
+
assert_not_nil(name_regex)
|
32
|
+
assert_match(name_regex, "Heathrow")
|
33
|
+
assert_match(name_regex, "heathrow")
|
34
|
+
assert_no_match(name_regex, "HeathrowX")
|
35
|
+
end
|
36
|
+
|
37
|
+
should "create an @matcher_prefixes array" do
|
38
|
+
by_priority = @scrape.instance_variable_get("@matcher_prefixes")
|
39
|
+
assert_not_nil(by_priority)
|
40
|
+
end
|
41
|
+
|
42
|
+
should "order @matcher_prefixes values in descending match_priority order" do
|
43
|
+
# Check that PWM comes before PDX
|
44
|
+
pref = @scrape.instance_variable_get("@matcher_prefixes")
|
45
|
+
by_priority = pref[@scrape.prefix_from_match("Portland")]
|
46
|
+
|
47
|
+
assert_not_nil by_priority
|
48
|
+
pdx = by_priority.detect {|x| x['code'] == 'PDX'}
|
49
|
+
pwm = by_priority.detect {|x| x['code'] == 'PWM'}
|
50
|
+
|
51
|
+
pdx_idx = by_priority.index(pdx)
|
52
|
+
pwm_idx = by_priority.index(pwm)
|
53
|
+
assert_not_nil pdx_idx
|
54
|
+
assert_not_nil pwm_idx
|
55
|
+
|
56
|
+
assert(pwm_idx < pdx_idx)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context "possible_flight?" do
|
61
|
+
setup do
|
62
|
+
@scrape = AirportScraper.new
|
63
|
+
end
|
64
|
+
|
65
|
+
["on a flight to Rome", "flying to SFO", "just touched down in Vegas", "EWR to NYC", "EWR -> NYC"].each do |phrase|
|
66
|
+
should "return true for the phrase '#{phrase}'" do
|
67
|
+
assert @scrape.possible_flight?(phrase)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
context "extract_airports" do
|
73
|
+
setup do
|
74
|
+
@scrape = AirportScraper.new
|
75
|
+
end
|
76
|
+
|
77
|
+
context "when there are no airports in the text" do
|
78
|
+
should "return an empty_array" do
|
79
|
+
assert_equal [], @scrape.extract_airports("Twas brillig and the slithy toves")
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
context "Airport code tests" do
|
84
|
+
setup do
|
85
|
+
@scrape = AirportScraper.new
|
86
|
+
end
|
87
|
+
|
88
|
+
should_eventually "be able to match the airport codes" do
|
89
|
+
@scrape.airports.each do |airport|
|
90
|
+
assert_contains @scrape.extract_airports("Just landed in #{airport['code']}."), airport
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
context "Freeform name test" do
|
96
|
+
[US_TESTS, CA_TESTS, INTL_TESTS].each do |tests|
|
97
|
+
tests.keys.each do |code|
|
98
|
+
tests[code].each do |str|
|
99
|
+
should "return the airport #{code} for phrase '#{str}'" do
|
100
|
+
airport = @scrape.airport(code)
|
101
|
+
results = @scrape.extract_airports(str)
|
102
|
+
assert_contains results, airport, "Expected #{code}, returned #{results.map {|x| x['code']}.inspect }"
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
context "Matchers" do
|
110
|
+
setup do
|
111
|
+
# Build the scraper under the name the tests in this context read (@scrape).
@scrape = AirportScraper.new
|
112
|
+
end
|
113
|
+
|
114
|
+
should "not have duplicate matchers for two airports" do
|
115
|
+
matchers = {}
|
116
|
+
airports = @scrape.airports
|
117
|
+
|
118
|
+
airports.values.each do |airport|
|
119
|
+
airport['matchers'].each do |matcher|
|
120
|
+
if matchers[matcher].nil?
|
121
|
+
matchers[matcher] = airport
|
122
|
+
else
|
123
|
+
# if matchers[matcher]['code']['match_priority'] == airport['code']['match_priority']
|
124
|
+
flunk "Matcher '#{matcher}' for more than one airport (#{matchers[matcher]['code']}, #{airport['code']}) at same priority"
|
125
|
+
# end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
should_eventually "not have shorter matchers for a name with a match_priority greater than a longer variant"
|
132
|
+
end
|
133
|
+
|
134
|
+
context "Bad matches" do
|
135
|
+
BAD_MATCHES.each do |str|
|
136
|
+
should "not return any airports for phrase '#{str}'" do
|
137
|
+
results = @scrape.extract_airports(str)
|
138
|
+
assert_equal [], results, "Should not have matched anything, returned #{results.map {|x| x['code']}.inspect }"
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
ABQ:
|
2
|
+
- "W'sup Dallas? 2 hr flight to albuquerque. There for two days and then to colorado to see the brothers for a week :)"
|
3
|
+
- "Jus landed in new mexico...my 1st time here"
|
4
|
+
- "just landed in ABQ, new mexico! hello coooold!"
|
5
|
+
|
6
|
+
BIL:
|
7
|
+
- "Making a quick run to SeaTac to drop off T@. She's flying to Billings but needs be in Bismark"
|
8
|
+
|
9
|
+
CLT:
|
10
|
+
- "Landed in Charlotte..."
|
11
|
+
|
12
|
+
CMH:
|
13
|
+
- "working a terrible shift, but flying to Columbus in the morning!"
|
14
|
+
|
15
|
+
DAL:
|
16
|
+
- "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
|
17
|
+
|
18
|
+
DEN:
|
19
|
+
- "Landed in Denver, now off to the terminal to await the arrival of my mother. I can't wait to consume mass amounts of turkey on Thursday."
|
20
|
+
|
21
|
+
DFW:
|
22
|
+
- "Just landed in Dallas"
|
23
|
+
|
24
|
+
EWR:
|
25
|
+
- "EWR Thanksgiving here we come. MIA to EWR to BDL"
|
26
|
+
- "just landed in NJ... its a gud look so far!!"
|
27
|
+
|
28
|
+
FLL:
|
29
|
+
- "Landed in Ft. Lauderdale. Driving to Boca."
|
30
|
+
|
31
|
+
JFK:
|
32
|
+
- "This little brat just spilled water on me during my 40 minute delayed flight to jfk. FML"
|
33
|
+
|
34
|
+
LAS:
|
35
|
+
- "Touched down in Vegas. Vegas baby!"
|
36
|
+
- "Landed in Vegas, watch out."
|
37
|
+
|
38
|
+
LAX:
|
39
|
+
- "Just landed in LA sans DJ equip, but its good to be back."
|
40
|
+
|
41
|
+
LGA:
|
42
|
+
- "kind of weird...Karlie Kloss was on my flight to LaGuardia today. haha I was tempted to get a picture with her, but she looked exhausted!"
|
43
|
+
|
44
|
+
LIT:
|
45
|
+
- "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
|
46
|
+
|
47
|
+
OAK:
|
48
|
+
- "flight to Oakland departs at 10:25. And I only just got on a shuttle from the parking lots. Here's hoping I don't miss my flight!"
|
49
|
+
|
50
|
+
PDX:
|
51
|
+
- "On my flight to Portland. Looking forward to kicking back at home."
|
52
|
+
- "Landing in PDX now. See you all soon."
|
53
|
+
|
54
|
+
PWM:
|
55
|
+
- "Landed in Portland ME"
|
56
|
+
|
57
|
+
RDU:
|
58
|
+
- "We just landed in Raleigh. Now its time to pack, go to church, and leave tonight to go to Mom's for Thanksgiving!"
|
59
|
+
|
60
|
+
SDF:
|
61
|
+
- "After leaving at 4:30am...just landed in Louisville. And yes, we arrived without a turkey smashing our windshield this year."
|
62
|
+
|
63
|
+
SFO:
|
64
|
+
- "@ San Francisco Airport waiting for my flight to Delhi via Beijing. Delayed by 3 hrs but knew that this morning thanks to Twitter."
|
65
|
+
|
66
|
+
SEA:
|
67
|
+
- "On the same Virgin America flight to Seattle with Top Chef's Marcel whose hair was perfectly peaked even at 9 am"
|
68
|
+
- "Alaska/Horizon FTW - on an earlier flight to SeaTac!"
|
69
|
+
|
70
|
+
STL:
|
71
|
+
- "Just landed in St.Louis, its raining outside smh I'm still going 2 enjoy my break"
|
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: airport_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jacob Harris
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-01 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: shoulda
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
description: A gem for extracting airport codes from text
|
36
|
+
email: jharris@nytimes.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.rdoc
|
44
|
+
files:
|
45
|
+
- .document
|
46
|
+
- .gitignore
|
47
|
+
- LICENSE
|
48
|
+
- README.rdoc
|
49
|
+
- Rakefile
|
50
|
+
- VERSION
|
51
|
+
- lib/airport_scraper.rb
|
52
|
+
- lib/ca_airports.yml
|
53
|
+
- lib/intl_airports.yml
|
54
|
+
- lib/us_airports.yml
|
55
|
+
- script/console
|
56
|
+
- test/bad_matches.yml
|
57
|
+
- test/ca_airports_tests.yml
|
58
|
+
- test/helper.rb
|
59
|
+
- test/intl_airports_tests.yml
|
60
|
+
- test/test_airport_scraper.rb
|
61
|
+
- test/us_airports_tests.yml
|
62
|
+
has_rdoc: true
|
63
|
+
homepage: http://github.com/harrisj/airport_scraper
|
64
|
+
licenses: []
|
65
|
+
|
66
|
+
post_install_message:
|
67
|
+
rdoc_options:
|
68
|
+
- --charset=UTF-8
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: "0"
|
76
|
+
version:
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: "0"
|
82
|
+
version:
|
83
|
+
requirements: []
|
84
|
+
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 1.3.5
|
87
|
+
signing_key:
|
88
|
+
specification_version: 3
|
89
|
+
summary: A gem for extracting airports from mentions in text
|
90
|
+
test_files:
|
91
|
+
- test/helper.rb
|
92
|
+
- test/test_airport_scraper.rb
|