airport_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+ # File: script/console
3
+ irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
4
+
5
+ libs = " -r irb/completion"
6
+ # Perhaps use a console_lib to store any extra methods I may want available in the cosole
7
+ # libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
8
+ libs << " -r #{File.dirname(__FILE__) + '/../lib/airport_scraper.rb'}"
9
+ puts "Loading airport_scraper gem"
10
+ exec "#{irb} #{libs} --simple-prompt"
@@ -0,0 +1,10 @@
1
+ - "Taliban factions compete for credit in CIA bombing deaths - It was one of the worst blows ever to America's int.. http://bit.ly/8uo7l4"
2
+ - "@therealjuicyj #wtf does 'goin' ham' mean?! i hear you use it but we don't say that in toronto at all.."
3
+ - "Angelina Jolie and Brad Pitt take their kids to the Mary Poppins show on Broadway in NYC – January 3 â. http://bit.ly/6vcSl1"
4
+ - "back in san diego, ready to boogaaayy. :]"
5
+ - "In SLC Sky Club watching Jets game. http://myloc.me/2AFzh"
6
+ - "It's been real! See you in ATL shawty! RT @CEOTUFFLUV: it was fun! Last night in Buffalo!"
7
+ - "'What is Justin Beiber's birthday? Justin Bieber (real name Alex Lawrence) was born March 14th, 1994 in London.' -- LMAO! LOSERS!"
8
+ - "Going to climb into bed with a book. Work in Philly resumes tomorrow."
9
+ - "RT @ourpdx: Do you want a chance to win tickets for Xanadu's stage perf in PDX? Read this: http://bit.ly/8LnD70"
10
+
@@ -0,0 +1,102 @@
1
+ YCD:
2
+ - "On a flight to Nanaimo"
3
+
4
+ YEG:
5
+ - "Flying from NYC to YEG"
6
+ - "Landing in Edmonton now. Oh Canada!"
7
+ - "On the plane to Alberta"
8
+
9
+ YFC:
10
+ - "Touched down in Fredericton. Now to drive home."
11
+
12
+ YGK:
13
+ - "On the plane to Kingston Ontario"
14
+ - "Landing in Kingston, ON"
15
+ - "Flying from New York City to Kingston, Ontario"
16
+
17
+ YHZ:
18
+ - "Landed in Halifax. Now to find the Trailer Park Boys"
19
+
20
+ YKF:
21
+ - "On a jet to Kitchener."
22
+
23
+ YMQ:
24
+ - "Landing in Montreal next stop: Poutine"
25
+
26
+ YOW:
27
+ - "Just touched down in Ottawa"
28
+
29
+ YUL:
30
+ - "Landing at Montreal Dorval airport"
31
+ - "Landed at Dorval and ready to leave the plane"
32
+
33
+ YQB:
34
+ - "On a flight to Quebec City"
35
+
36
+ YQM:
37
+ - "Landing in Moncton, you probably don't know where this is"
38
+
39
+ YQR:
40
+ - "On a turboprop to Regina"
41
+
42
+ YQT:
43
+ - "Landed in Thunder Bay"
44
+
45
+ YVR:
46
+ - "Flying to Vancouver tomorrow for the Winter Olympics."
47
+
48
+ YWG:
49
+ - "Landing in Winnipeg."
50
+
51
+ YYZ:
52
+ - "Good times flying from YYZ to SFO today. Why is our security so reactive? Fear? If terror was Umar Farouk's goal, then Mission Accomplished."
53
+ - "flying back to Toronto this morning. Sad"
54
+
55
+ # YXU:
56
+ # city: "London, ON"
57
+ # match_priority: 100
58
+ # matchers:
59
+ # - "London, ON"
60
+ # - "London Ontario"
61
+ # - "London, Ontario"
62
+
63
+ # YXY:
64
+ # city: "Whitehorse, YT"
65
+ # matchers:
66
+ # - "Whitehorse"
67
+ #
68
+ # YYC:
69
+ # city: Calgary
70
+ # country: ca
71
+ # matchers:
72
+ # - Calgary
73
+ #
74
+ # YYG:
75
+ # city: "Charlottetown, PEI"
76
+ # country: ca
77
+ # matchers:
78
+ # - "Charlottetown"
79
+ # - "Prince Edward Island"
80
+ # - "P.E.I."
81
+ #
82
+ # YYJ:
83
+ # city: "Victoria, BC"
84
+ # matchers:
85
+ # - "Victoria"
86
+ #
87
+ # YYT:
88
+ # city: "Saint Johns, NL"
89
+ # matchers:
90
+ # - "Saint Johns"
91
+ # - "St. Johns"
92
+ # - "St.Johns"
93
+ #
94
+ # YYZ:
95
+ # city: Toronto
96
+ # matchers:
97
+ # - "Toronto"
98
+ #
99
+ # YZF:
100
+ # city: "Yellowknife, NT"
101
+ # matchers:
102
+ # - "Yellowknife"
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) # put the sibling lib/ directory at the front of the load path
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) # then put this file's own directory ahead of it
7
+ require 'airport_scraper'
8
+
9
+ class Test::Unit::TestCase # reopened with an empty body; nothing is added to the base test case here
10
+ end
@@ -0,0 +1,2 @@
1
+ AMS:
2
+ - "On board KLM Royal Dutch Airlines KL 3122 from PRG to AMS now #Flight"
@@ -0,0 +1,143 @@
1
+ require 'helper'
2
+ require 'yaml'
3
+
4
+ CA_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "ca_airports_tests.yml")) # fixture next to this file; used below as airport code => list of phrases
5
+ US_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "us_airports_tests.yml")) # same usage as CA_TESTS
6
+ INTL_TESTS = YAML.load_file(File.join(File.dirname(__FILE__), "intl_airports_tests.yml")) # same usage as CA_TESTS
7
+ BAD_MATCHES = YAML.load_file(File.join(File.dirname(__FILE__), "bad_matches.yml")) # phrases that the tests below expect to yield no airports
8
+
9
+ class TestAirportScraper < Test::Unit::TestCase
10
+ context "new" do # checks on a freshly constructed AirportScraper
11
+ setup do
12
+ @scrape = AirportScraper.new
13
+ end
14
+
15
+ should "load the airports.yml file into @airports" do
16
+ airports = @scrape.airports
17
+ assert_not_nil airports
18
+ assert_not_nil airports['JFK'], "Didn't find JFK in airports" # airports is looked up by code string
19
+ end
20
+
21
+ should_eventually "create an @code_match_regex to match 3-letter codes" do # shoulda deferred test (should_eventually)
22
+ code_regex = @scrape.instance_variable_get("@code_match_regex")
23
+ assert_not_nil(code_regex)
24
+ assert_match(code_regex, 'JFK')
25
+ assert_no_match(code_regex, 'JFKX') # a fourth letter must not match
26
+ assert_no_match(code_regex, 'jfk') # lower-case codes must not match
27
+ end
28
+
29
+ should_eventually "create an @airport_regex" do # shoulda deferred test (should_eventually)
30
+ name_regex = @scrape.instance_variable_get("@airport_regex")
31
+ assert_not_nil(name_regex)
32
+ assert_match(name_regex, "Heathrow")
33
+ assert_match(name_regex, "heathrow") # name matching is expected to ignore case
34
+ assert_no_match(name_regex, "HeathrowX") # a name followed by extra letters must not match
35
+ end
36
+
37
+ should "create an @matcher_prefixes array" do
38
+ by_priority = @scrape.instance_variable_get("@matcher_prefixes")
39
+ assert_not_nil(by_priority)
40
+ end
41
+
42
+ should "order @matcher_prefixes values in descending match_priority order" do
43
+ # Check that PWM comes before PDX
44
+ pref = @scrape.instance_variable_get("@matcher_prefixes")
45
+ by_priority = pref[@scrape.prefix_from_match("Portland")] # entries filed under the prefix key derived from "Portland"
46
+
47
+ assert_not_nil by_priority
48
+ pdx = by_priority.detect {|x| x['code'] == 'PDX'}
49
+ pwm = by_priority.detect {|x| x['code'] == 'PWM'}
50
+
51
+ pdx_idx = by_priority.index(pdx)
52
+ pwm_idx = by_priority.index(pwm)
53
+ assert_not_nil pdx_idx
54
+ assert_not_nil pwm_idx
55
+
56
+ assert(pwm_idx < pdx_idx) # PWM must sit earlier in the list than PDX
57
+ end
58
+ end
59
+
60
+ context "possible_flight?" do # each sample phrase below must be reported as a possible flight
61
+ setup do
62
+ @scrape = AirportScraper.new
63
+ end
64
+
65
+ ["on a flight to Rome", "flying to SFO", "just touched down in Vegas", "EWR to NYC", "EWR -> NYC"].each do |phrase| # one generated test per phrase
66
+ should "return true for the phrase '#{phrase}'" do
67
+ assert @scrape.possible_flight?(phrase)
68
+ end
69
+ end
70
+ end
71
+
72
+ context "extract_airports" do
73
+ setup do
74
+ @scrape = AirportScraper.new
75
+ end
76
+
77
+ context "when there are no airports in the text" do
78
+ should "return an empty_array" do
79
+ assert_equal [], @scrape.extract_airports("Twas brillig and the slithy toves")
80
+ end
81
+ end
82
+
83
+ context "Airport code tests" do
84
+ setup do
85
+ @scrape = AirportScraper.new
86
+ end
87
+
88
+ should_eventually "be able to match the airport codes" do
89
+ @scrape.airports.each do |airport|
90
+ assert_contains @scrape.extract_airports("Just landed in #{airport['code']}."), airport
91
+ end
92
+ end
93
+ end
94
+
95
+ context "Freeform name test" do
96
+ [US_TESTS, CA_TESTS, INTL_TESTS].each do |tests|
97
+ tests.keys.each do |code|
98
+ tests[code].each do |str|
99
+ should "return the airport #{code} for phrase '#{str}'" do
100
+ airport = @scrape.airport(code)
101
+ results = @scrape.extract_airports(str)
102
+ assert_contains results, airport, "Expected #{code}, returned #{results.map {|x| x['code']}.inspect }"
103
+ end
104
+ end
105
+ end
106
+ end
107
+ end
108
+
109
+ context "Matchers" do
110
+ setup do
111
+ @scape = AirportScraper.new
112
+ end
113
+
114
+ should "not have duplicate matchers for two airports" do
115
+ matchers = {}
116
+ airports = @scrape.airports
117
+
118
+ airports.values.each do |airport|
119
+ airport['matchers'].each do |matcher|
120
+ if matchers[matcher].nil?
121
+ matchers[matcher] = airport
122
+ else
123
+ # if matchers[matcher]['code']['match_priority'] == airport['code']['match_priority']
124
+ flunk "Matcher '#{matcher}' for more than one airport (#{matchers[matcher]['code']}, #{airport['code']}) at same priority"
125
+ # end
126
+ end
127
+ end
128
+ end
129
+ end
130
+
131
+ should_eventually "not have shorter matchers for a name with a match_priority greater than a longer variant"
132
+ end
133
+
134
+ context "Bad matches" do
135
+ BAD_MATCHES.each do |str|
136
+ should "not return any airports for phrase '#{str}'" do
137
+ results = @scrape.extract_airports(str)
138
+ assert_equal [], results, "Should not have matched anything, returned #{results.map {|x| x['code']}.inspect }"
139
+ end
140
+ end
141
+ end
142
+ end
143
+ end
@@ -0,0 +1,71 @@
1
+ ABQ:
2
+ - "W'sup Dallas? 2 hr flight to albuquerque. There for two days and then to colorado to see the brothers for a week :)"
3
+ - "Jus landed in new mexico...my 1st time here"
4
+ - "just landed in ABQ, new mexico! hello coooold!"
5
+
6
+ BIL:
7
+ - "Making a quick run to SeaTac to drop off T@. She's flying to Billings but needs be in Bismark"
8
+
9
+ CLT:
10
+ - "Landed in Charlotte..."
11
+
12
+ CMH:
13
+ - "working a terrible shift, but flying to Columbus in the morning!"
14
+
15
+ DAL:
16
+ - "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
17
+
18
+ DEN:
19
+ - "Landed in Denver, now off to the terminal to await the arrival of my mother. I can't wait to consume mass amounts of turkey on Thursday."
20
+
21
+ DFW:
22
+ - "Just landed in Dallas"
23
+
24
+ EWR:
25
+ - "EWR Thanksgiving here we come. MIA to EWR to BDL"
26
+ - "just landed in NJ... its a gud look so far!!"
27
+
28
+ FLL:
29
+ - "Landed in Ft. Lauderdale. Driving to Boca."
30
+
31
+ JFK:
32
+ - "This little brat just spilled water on me during my 40 minute delayed flight to jfk. FML"
33
+
34
+ LAS:
35
+ - "Touched down in Vegas. Vegas baby!"
36
+ - "Landed in Vegas, watch out."
37
+
38
+ LAX:
39
+ - "Just landed in LA sans DJ equip, but its good to be back."
40
+
41
+ LGA:
42
+ - "kind of weird...Karlie Kloss was on my flight to LaGuardia today. haha I was tempted to get a picture with her, but she looked exhausted!"
43
+
44
+ LIT:
45
+ - "On @SouthwestAir #2992 heading from LIT to DAL and CEO Gary Kelly is on board!"
46
+
47
+ OAK:
48
+ - "flight to Oakland departs at 10:25. And I only just got on a shuttle from the parking lots. Here's hoping I don't miss my flight!"
49
+
50
+ PDX:
51
+ - "On my flight to Portland. Looking forward to kicking back at home."
52
+ - "Landing in PDX now. See you all soon."
53
+
54
+ PWM:
55
+ - "Landed in Portland ME"
56
+
57
+ RDU:
58
+ - "We just landed in Raleigh. Now its time to pack, go to church, and leave tonight to go to Mom's for Thanksgiving!"
59
+
60
+ SDF:
61
+ - "After leaving at 4:30am...just landed in Louisville. And yes, we arrived without a turkey smashing our windshield this year."
62
+
63
+ SFO:
64
+ - "@ San Francisco Airport waiting for my flight to Delhi via Beijing. Delayed by 3 hrs but knew that this morning thanks to Twitter."
65
+
66
+ SEA:
67
+ - "On the same Virgin America flight to Seattle with Top Chef's Marcel whose hair was perfectly peaked even at 9 am"
68
+ - "Alaska/Horizon FTW - on an earlier flight to SeaTac!"
69
+
70
+ STL:
71
+ - "Just landed in St.Louis, its raining outside smh I'm still going 2 enjoy my break"
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: airport_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jacob Harris
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-01 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description: A gem for extracting airport codes from text
36
+ email: jharris@nytimes.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ - README.rdoc
44
+ files:
45
+ - .document
46
+ - .gitignore
47
+ - LICENSE
48
+ - README.rdoc
49
+ - Rakefile
50
+ - VERSION
51
+ - lib/airport_scraper.rb
52
+ - lib/ca_airports.yml
53
+ - lib/intl_airports.yml
54
+ - lib/us_airports.yml
55
+ - script/console
56
+ - test/bad_matches.yml
57
+ - test/ca_airports_tests.yml
58
+ - test/helper.rb
59
+ - test/intl_airports_tests.yml
60
+ - test/test_airport_scraper.rb
61
+ - test/us_airports_tests.yml
62
+ has_rdoc: true
63
+ homepage: http://github.com/harrisj/airport_scraper
64
+ licenses: []
65
+
66
+ post_install_message:
67
+ rdoc_options:
68
+ - --charset=UTF-8
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: "0"
76
+ version:
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: "0"
82
+ version:
83
+ requirements: []
84
+
85
+ rubyforge_project:
86
+ rubygems_version: 1.3.5
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: A gem for extracting airports from mentions in text
90
+ test_files:
91
+ - test/helper.rb
92
+ - test/test_airport_scraper.rb