airport_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Launches an interactive irb session with tab completion enabled and the
# airport_scraper library already required, using the simple prompt.

# Ruby builds whose RUBY_PLATFORM contains "mswin" or "mingw" get the batch
# launcher; everything else gets plain `irb`.
# The group is non-capturing, `(?:...)`. The previous `(:?...)` spelling was a
# typo that produced a capturing group with an optional literal colon.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

# Accumulates the `-r` options passed to irb.
libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/airport_scraper.rb'}"
puts "Loading airport_scraper gem"
# exec replaces the current process with irb, so nothing after this line runs.
exec "#{irb} #{libs} --simple-prompt"
@@ -0,0 +1,10 @@
1
+ - "Taliban factions compete for credit in CIA bombing deaths - It was one of the worst blows ever to America's int.. http://bit.ly/8uo7l4"
2
+ - "@therealjuicyj #wtf does 'goin' ham' mean?! i hear you use it but we don't say that in toronto at all.."
3
+ - "Angelina Jolie and Brad Pitt take their kids to the Mary Poppins show on Broadway in NYC – January 3 â. http://bit.ly/6vcSl1"
4
+ - "back in san diego, ready to boogaaayy. :]"
5
+ - "In SLC Sky Club watching Jets game. http://myloc.me/2AFzh"
6
+ - "It's been real! See you in ATL shawty! RT @CEOTUFFLUV: it was fun! Last night in Buffalo!"
7
+ - "'What is Justin Beiber's birthday? Justin Bieber (real name Alex Lawrence) was born March 14th, 1994 in London.' -- LMAO! LOSERS!"
8
+ - "Going to climb into bed with a book. Work in Philly resumes tomorrow."
9
+ - "RT @ourpdx: Do you want a chance to win tickets for Xanadu's stage perf in PDX? Read this: http://bit.ly/8LnD70"
10
+
@@ -0,0 +1,102 @@
1
+ YCD:
2
+ - "On a flight to Nanaimo"
3
+
4
+ YEG:
5
+ - "Flying from NYC to YEG"
6
+ - "Landing in Edmonton now. Oh Canada!"
7
+ - "On the plane to Alberta"
8
+
9
+ YFC:
10
+ - "Touched down in Fredericton. Now to drive home."
11
+
12
+ YGK:
13
+ - "On the plane to Kingston Ontario"
14
+ - "Landing in Kingston, ON"
15
+ - "Flying from New York City to Kingston, Ontario"
16
+
17
+ YHZ:
18
+ - "Landed in Halifax. Now to find the Trailer Park Boys"
19
+
20
+ YKF:
21
+ - "On a jet to Kitchener."
22
+
23
+ YMQ:
24
+ - "Landing in Montreal next stop: Poutine"
25
+
26
+ YOW:
27
+ - "Just touched down in Ottawa"
28
+
29
+ YUL:
30
+ - "Landing at Montreal Dorval airport"
31
+ - "Landed at Dorval and ready to leave the plane"
32
+
33
+ YQB:
34
+ - "On a flight to Quebec City"
35
+
36
+ YQM:
37
+ - "Landing in Moncton, you probably don't know where this is"
38
+
39
+ YQR:
40
+ - "On a turboprop to Regina"
41
+
42
+ YQT:
43
+ - "Landed in Thunder Bay"
44
+
45
+ YVR:
46
+ - "Flying to Vancouver tomorrow for the Winter Olympics."
47
+
48
+ YWG:
49
+ - "Landing in Winnipeg."
50
+
51
+ YYZ:
52
+ - "Good times flying from YYZ to SFO today. Why is our security so reactive? Fear? If terror was Umar Farouk's goal, then Mission Accomplished."
53
+ - "flying back to Toronto this morning. Sad"
54
+
55
+ # YXU:
56
+ # city: "London, ON"
57
+ # match_priority: 100
58
+ # matchers:
59
+ # - "London, ON"
60
+ # - "London Ontario"
61
+ # - "London, Ontario"
62
+
63
+ # YXY:
64
+ # city: "Whitehorse, YT"
65
+ # matchers:
66
+ # - "Whitehorse"
67
+ #
68
+ # YYC:
69
+ # city: Calgary
70
+ # country: ca
71
+ # matchers:
72
+ # - Calgary
73
+ #
74
+ # YYG:
75
+ # city: "Charlottetown, PEI"
76
+ # country: ca
77
+ # matchers:
78
+ # - "Charlottetown"
79
+ # - "Prince Edward Island"
80
+ # - "P.E.I."
81
+ #
82
+ # YYJ:
83
+ # city: "Victoria, BC"
84
+ # matchers:
85
+ # - "Victoria"
86
+ #
87
+ # YYT:
88
+ # city: "Saint Johns, NL"
89
+ # matchers:
90
+ # - "Saint Johns"
91
+ # - "St. Johns"
92
+ # - "St.Johns"
93
+ #
94
+ # YYZ:
95
+ # city: Toronto
96
+ # matchers:
97
+ # - "Toronto"
98
+ #
99
+ # YZF:
100
+ # city: "Yellowknife, NT"
101
+ # matchers:
102
+ # - "Yellowknife"
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
# Put the gem's lib/ directory and this test directory at the front of the
# load path so that `require 'airport_scraper'` (and sibling test files such
# as `helper`) resolve against the working tree.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'airport_scraper'

# Reopens the base test case class without adding anything to it.
class Test::Unit::TestCase
end
@@ -0,0 +1,2 @@
1
+ AMS:
2
+ - "On board KLM Royal Dutch Airlines KL 3122 from PRG to AMS now #Flight"
@@ -0,0 +1,143 @@
1
+ require 'helper'
2
+ require 'yaml'
3
+
4
# YAML fixtures that sit next to this test file.
# CA_TESTS / US_TESTS / INTL_TESTS are used below as mappings from an airport
# code to a list of phrases; BAD_MATCHES is used as a flat list of phrases.
fixture_dir = File.dirname(__FILE__)

CA_TESTS    = YAML.load_file(File.join(fixture_dir, "ca_airports_tests.yml"))
US_TESTS    = YAML.load_file(File.join(fixture_dir, "us_airports_tests.yml"))
INTL_TESTS  = YAML.load_file(File.join(fixture_dir, "intl_airports_tests.yml"))
BAD_MATCHES = YAML.load_file(File.join(fixture_dir, "bad_matches.yml"))
8
+
9
# Shoulda-style test suite for AirportScraper.
#
# Covers three areas:
# * "new"              - state that a fresh scraper exposes (airports table,
#                        @matcher_prefixes and its ordering).
# * "possible_flight?" - phrases that should be recognised as flight mentions.
# * "extract_airports" - extraction driven by the YAML fixtures (US_TESTS,
#                        CA_TESTS, INTL_TESTS, BAD_MATCHES), plus a sanity
#                        check on the matcher table itself.
#
# Tests declared with `should_eventually` are pending placeholders.
class TestAirportScraper < Test::Unit::TestCase
  context "new" do
    setup do
      @scrape = AirportScraper.new
    end

    should "load the airports.yml file into @airports" do
      airports = @scrape.airports
      assert_not_nil airports
      assert_not_nil airports['JFK'], "Didn't find JFK in airports"
    end

    should_eventually "create an @code_match_regex to match 3-letter codes" do
      code_regex = @scrape.instance_variable_get("@code_match_regex")
      assert_not_nil(code_regex)
      assert_match(code_regex, 'JFK')
      assert_no_match(code_regex, 'JFKX')
      assert_no_match(code_regex, 'jfk')
    end

    should_eventually "create an @airport_regex" do
      name_regex = @scrape.instance_variable_get("@airport_regex")
      assert_not_nil(name_regex)
      assert_match(name_regex, "Heathrow")
      assert_match(name_regex, "heathrow")
      assert_no_match(name_regex, "HeathrowX")
    end

    should "create an @matcher_prefixes array" do
      by_priority = @scrape.instance_variable_get("@matcher_prefixes")
      assert_not_nil(by_priority)
    end

    should "order @matcher_prefixes values in descending match_priority order" do
      # Check that PWM comes before PDX
      pref = @scrape.instance_variable_get("@matcher_prefixes")
      by_priority = pref[@scrape.prefix_from_match("Portland")]

      assert_not_nil by_priority
      pdx = by_priority.detect { |x| x['code'] == 'PDX' }
      pwm = by_priority.detect { |x| x['code'] == 'PWM' }

      pdx_idx = by_priority.index(pdx)
      pwm_idx = by_priority.index(pwm)
      assert_not_nil pdx_idx
      assert_not_nil pwm_idx

      assert(pwm_idx < pdx_idx)
    end
  end

  context "possible_flight?" do
    setup do
      @scrape = AirportScraper.new
    end

    ["on a flight to Rome", "flying to SFO", "just touched down in Vegas", "EWR to NYC", "EWR -> NYC"].each do |phrase|
      should "return true for the phrase '#{phrase}'" do
        assert @scrape.possible_flight?(phrase)
      end
    end
  end

  context "extract_airports" do
    # This setup also runs for every nested context below.
    setup do
      @scrape = AirportScraper.new
    end

    context "when there are no airports in the text" do
      should "return an empty_array" do
        assert_equal [], @scrape.extract_airports("Twas brillig and the slithy toves")
      end
    end

    context "Airport code tests" do
      setup do
        @scrape = AirportScraper.new
      end

      should_eventually "be able to match the airport codes" do
        # The other tests index airports by code and call #values on it, so it
        # is treated here as a code-keyed Hash; iterate the airport records
        # rather than [code, record] pairs, which would break airport['code'].
        @scrape.airports.each_value do |airport|
          assert_contains @scrape.extract_airports("Just landed in #{airport['code']}."), airport
        end
      end
    end

    context "Freeform name test" do
      # One generated test per (airport code, phrase) pair in the fixtures.
      [US_TESTS, CA_TESTS, INTL_TESTS].each do |tests|
        tests.keys.each do |code|
          tests[code].each do |str|
            should "return the airport #{code} for phrase '#{str}'" do
              airport = @scrape.airport(code)
              results = @scrape.extract_airports(str)
              assert_contains results, airport, "Expected #{code}, returned #{results.map {|x| x['code']}.inspect }"
            end
          end
        end
      end
    end

    context "Matchers" do
      setup do
        # Was misspelled `@scape`, which left an unused instance variable; the
        # test only worked because the enclosing context's setup sets @scrape.
        @scrape = AirportScraper.new
      end

      should "not have duplicate matchers for two airports" do
        matchers = {}
        airports = @scrape.airports

        airports.values.each do |airport|
          airport['matchers'].each do |matcher|
            if matchers[matcher].nil?
              # First airport seen with this matcher.
              matchers[matcher] = airport
            else
              # if matchers[matcher]['code']['match_priority'] == airport['code']['match_priority']
              flunk "Matcher '#{matcher}' for more than one airport (#{matchers[matcher]['code']}, #{airport['code']}) at same priority"
              # end
            end
          end
        end
      end

      should_eventually "not have shorter matchers for a name with a match_priority greater than a longer variant"
    end

    context "Bad matches" do
      # One generated test per phrase that must not yield any airport.
      BAD_MATCHES.each do |str|
        should "not return any airports for phrase '#{str}'" do
          results = @scrape.extract_airports(str)
          assert_equal [], results, "Should not have matched anything, returned #{results.map {|x| x['code']}.inspect }"
        end
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ ABQ:
2
+ - "W'sup Dallas? 2 hr flight to albuquerque. There for two days and then to colorado to see the brothers for a week :)"
3
+ - "Jus landed in new mexico...my 1st time here"
4
+ - "just landed in ABQ, new mexico! hello coooold!"
5
+
6
+ BIL:
7
+ - "Making a quick run to SeaTac to drop off T@. She's flying to Billings but needs be in Bismark"
8
+
9
+ CLT:
10
+ - "Landed in Charlotte..."
11
+
12
+ CMH:
13
+ - "working a terrible shift, but flying to Columbus in the morning!"
14
+
15
+ DAL:
16
+ - "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
17
+
18
+ DEN:
19
+ - "Landed in Denver, now off to the terminal to await the arrival of my mother. I can't wait to consume mass amounts of turkey on Thursday."
20
+
21
+ DFW:
22
+ - "Just landed in Dallas"
23
+
24
+ EWR:
25
+ - "EWR Thanksgiving here we come. MIA to EWR to BDL"
26
+ - "just landed in NJ... its a gud look so far!!"
27
+
28
+ FLL:
29
+ - "Landed in Ft. Lauderdale. Driving to Boca."
30
+
31
+ JFK:
32
+ - "This little brat just spilled water on me during my 40 minute delayed flight to jfk. FML"
33
+
34
+ LAS:
35
+ - "Touched down in Vegas. Vegas baby!"
36
+ - "Landed in Vegas, watch out."
37
+
38
+ LAX:
39
+ - "Just landed in LA sans DJ equip, but its good to be back."
40
+
41
+ LGA:
42
+ - "kind of weird...Karlie Kloss was on my flight to LaGuardia today. haha I was tempted to get a picture with her, but she looked exhausted!"
43
+
44
+ LIT:
45
+ - "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
46
+
47
+ OAK:
48
+ - "flight to Oakland departs at 10:25. And I only just got on a shuttle from the parking lots. Here's hoping I don't miss my flight!"
49
+
50
+ PDX:
51
+ - "On my flight to Portland. Looking forward to kicking back at home."
52
+ - "Landing in PDX now. See you all soon."
53
+
54
+ PWM:
55
+ - "Landed in Portland ME"
56
+
57
+ RDU:
58
+ - "We just landed in Raleigh. Now its time to pack, go to church, and leave tonight to go to Mom's for Thanksgiving!"
59
+
60
+ SDF:
61
+ - "After leaving at 4:30am...just landed in Louisville. And yes, we arrived without a turkey smashing our windshield this year."
62
+
63
+ SFO:
64
+ - "@ San Francisco Airport waiting for my flight to Delhi via Beijing. Delayed by 3 hrs but knew that this morning thanks to Twitter."
65
+
66
+ SEA:
67
+ - "On the same Virgin America flight to Seattle with Top Chef's Marcel whose hair was perfectly peaked even at 9 am"
68
+ - "Alaska/Horizon FTW - on an earlier flight to SeaTac!"
69
+
70
+ STL:
71
+ - "Just landed in St.Louis, its raining outside smh I'm still going 2 enjoy my break"
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: airport_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jacob Harris
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-01 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description: A gem for extracting airport codes from text
36
+ email: jharris@nytimes.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ - README.rdoc
44
+ files:
45
+ - .document
46
+ - .gitignore
47
+ - LICENSE
48
+ - README.rdoc
49
+ - Rakefile
50
+ - VERSION
51
+ - lib/airport_scraper.rb
52
+ - lib/ca_airports.yml
53
+ - lib/intl_airports.yml
54
+ - lib/us_airports.yml
55
+ - script/console
56
+ - test/bad_matches.yml
57
+ - test/ca_airports_tests.yml
58
+ - test/helper.rb
59
+ - test/intl_airports_tests.yml
60
+ - test/test_airport_scraper.rb
61
+ - test/us_airports_tests.yml
62
+ has_rdoc: true
63
+ homepage: http://github.com/harrisj/airport_scraper
64
+ licenses: []
65
+
66
+ post_install_message:
67
+ rdoc_options:
68
+ - --charset=UTF-8
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: "0"
76
+ version:
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: "0"
82
+ version:
83
+ requirements: []
84
+
85
+ rubyforge_project:
86
+ rubygems_version: 1.3.5
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: A gem for extracting airports from mentions in text
90
+ test_files:
91
+ - test/helper.rb
92
+ - test/test_airport_scraper.rb