translink 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -1,6 +1,7 @@
1
1
  module Translink
2
2
  class CLI
3
- RUNNABLE = ['help', 'scrape']
3
+ RUNNABLE = ['help', 'scrape', 'version']
4
+ URL = 'http://jp.translink.com.au/travel-information/network-information/buses/all-timetables'
4
5
 
5
6
  attr_accessor :out, :pwd, :__crawler__
6
7
 
@@ -23,23 +24,48 @@ module Translink
23
24
 
24
25
  def help input
25
26
  tomorrow = Date.today + 1
26
- log 'Usage: translink scrape <DATE> [URI]'
27
+ log 'Usage: translink scrape <DATE> [DB_PATH] [FROM_ROUTE_URL] [STEP]'
28
+ log ' translink version'
27
29
  log ''
28
30
  log 'Examples:'
29
31
  log " translink scrape #{tomorrow}"
30
- log " translink scrape #{tomorrow} sqlite://~/Desktop/#{tomorrow}.sqlite3"
32
+ log " translink scrape #{tomorrow} ~/Desktop/#{tomorrow}.sqlite3"
33
+ log " translink scrape #{tomorrow} ~/Desktop/#{tomorrow}.sqlite3 http://jp.translink.com.au/travel-information/network-information/buses/435"
34
+ log " translink scrape #{tomorrow} ~/Desktop/#{tomorrow}.sqlite3 http://jp.translink.com.au/travel-information/network-information/buses/435/#{tomorrow} 0"
31
35
  end
32
36
 
33
37
  def scrape input
34
- return help nil unless input =~ /^(\d{4}-\d{2}-\d{2})(\s+--uri="?(.+)"?)?$/
35
- date = Date.parse $1
36
- uri = $3 || 'sqlite://' + File.join(pwd, "#{date}.sqlite3")
37
- DB.context uri, :migrate => true do
38
- crawler = __crawler__.new 'http://jp.translink.com.au/travel-information/network-information/buses/all-timetables'
39
- crawler.crawl date
38
+ args = (input || '').split /\s/
39
+ case args.size
40
+ when 1
41
+ date = Date.parse args[0]
42
+ db_path = File.join(pwd, "#{date}.sqlite3")
43
+ when 2
44
+ date = Date.parse args[0]
45
+ db_path = File.expand_path args[1]
46
+ when 3
47
+ date = Date.parse args[0]
48
+ db_path = File.expand_path args[1]
49
+ from_route_url = URI.parse args[2]
50
+ when 4
51
+ date = Date.parse args[0]
52
+ db_path = File.expand_path args[1]
53
+ from_route_url = URI.parse args[2]
54
+ step = args[3].to_i
55
+ else
56
+ help nil
57
+ return
58
+ end
59
+ DB.context "sqlite://#{db_path}", :migrate => !File.exists?(db_path) do
60
+ crawler = __crawler__.new URL
61
+ crawler.crawl date, from_route_url, step
40
62
  end
41
63
  end
42
64
 
65
+ def version input
66
+ log VERSION
67
+ end
68
+
43
69
  def log message
44
70
  out.puts message
45
71
  end
@@ -1,20 +1,53 @@
1
1
  module Translink
2
2
  class Crawler
3
- attr_reader :url
3
+ MAX_RETRY_COUNT = 2 # Maximum number of times to attempt a HTTP request.
4
+ SLEEP_DURATION = 5 # Base amount of time to sleep in seconds before retrying.
5
+
6
+ attr_accessor :out
7
+ attr_reader :url
4
8
 
5
9
  def initialize url
6
10
  @url = URI.parse url
11
+ @out = $stdout
7
12
  end
8
13
 
9
- def crawl date
14
+ def crawl date, from_route_url = nil, step = nil
10
15
  timetable_page = Page::Timetable.new(url.to_s).timetable_page date
11
- timetable_page.route_pages.each do |route_page|
12
- route_model = Model::Route.find_or_add_route_from_route_page route_page
13
- route_page.trip_pages.each do |trip_page|
14
- trip_model = route_model.add_trip_from_trip_page trip_page
15
- trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
16
- end
16
+ timetable_page.route_pages(from_route_url, step).each do |route_page|
17
+ crawl_route_page route_page
18
+ end
19
+ end
20
+
21
+ def crawl_route_page route_page, retry_count = 0
22
+ route_model = Model::Route.find_or_add_route_from_route_page route_page
23
+ route_page.trip_pages.each do |trip_page|
24
+ crawl_trip_page route_model, trip_page
25
+ end
26
+ rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
27
+ if retry_count <= MAX_RETRY_COUNT
28
+ sleep SLEEP_DURATION * retry_count
29
+ crawl_route_page route_page, retry_count + 1
30
+ else
31
+ out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
32
+ end
33
+ rescue => exception
34
+ out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
35
+ out.puts exception.backtrace
36
+ end
37
+
38
+ def crawl_trip_page route_model, trip_page, retry_count = 0
39
+ trip_model = route_model.add_trip_from_trip_page trip_page
40
+ trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
41
+ rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
42
+ if retry_count <= MAX_RETRY_COUNT
43
+ sleep SLEEP_DURATION * retry_count
44
+ crawl_trip_page route_model, trip_page, retry_count + 1
45
+ else
46
+ out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
17
47
  end
48
+ rescue => exception
49
+ out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
50
+ out.puts exception.backtrace
18
51
  end
19
52
  end
20
53
  end
@@ -2,11 +2,55 @@ module Translink
2
2
  module DB
3
3
  def self.context uri, options = {}
4
4
  DataMapper.setup :default, uri
5
- DataMapper.repository :default do
5
+ DataMapper.repository :default do |repository|
6
6
  DataMapper.finalize
7
- DataMapper.auto_migrate! if options[:migrate]
7
+ repository.adapter.execute <<-SQL
8
+ PRAGMA foreign_keys=ON;
9
+ SQL
10
+ if options[:migrate]
11
+ repository.adapter.execute <<-SQL
12
+ DROP TABLE IF EXISTS "routes";
13
+ CREATE TABLE "routes" (
14
+ "route_id" TEXT NOT NULL PRIMARY KEY UNIQUE,
15
+ "short_name" TEXT NOT NULL,
16
+ "long_name" TEXT NOT NULL,
17
+ "route_type" INTEGER NOT NULL
18
+ );
19
+
20
+ DROP TABLE IF EXISTS "trips";
21
+ CREATE TABLE "trips" (
22
+ "trip_id" INTEGER NOT NULL PRIMARY KEY UNIQUE,
23
+ "direction" INTEGER NOT NULL,
24
+ "headsign" TEXT NOT NULL,
25
+ "route_id" TEXT NOT NULL,
26
+ FOREIGN KEY ("route_id") REFERENCES "routes" ("route_id") ON DELETE CASCADE
27
+ );
28
+
29
+ CREATE INDEX "index_trips_on_route_id" ON "trips" ("route_id");
30
+
31
+ DROP TABLE IF EXISTS "stop_times";
32
+ CREATE TABLE "stop_times" (
33
+ "arrival_time" STRING NOT NULL,
34
+ "stop_sequence" INTEGER NOT NULL,
35
+ "stop_id" TEXT NOT NULL,
36
+ "trip_id" INTEGER NOT NULL,
37
+ PRIMARY KEY ("arrival_time", "stop_id", "trip_id"),
38
+ FOREIGN KEY ("stop_id") REFERENCES "stops" ("stop_id") ON DELETE RESTRICT,
39
+ FOREIGN KEY ("trip_id") REFERENCES "trips" ("trip_id") ON DELETE CASCADE
40
+ );
41
+
42
+ DROP TABLE IF EXISTS "stops";
43
+ CREATE TABLE "stops" (
44
+ "stop_id" TEXT NOT NULL PRIMARY KEY UNIQUE,
45
+ "stop_name" TEXT NOT NULL,
46
+ "stop_lat" REAL NOT NULL,
47
+ "stop_lon" REAL NOT NULL
48
+ );
49
+ SQL
50
+ end
8
51
  yield if block_given?
9
52
  end
10
53
  end
11
54
  end
12
55
  end
56
+
@@ -5,12 +5,19 @@ module Translink
5
5
 
6
6
  storage_names[:default] = 'routes'
7
7
 
8
- property :id, Serial
9
- property :short_name, String # Route code. Eg "130".
10
- property :long_name, String # Suburbs serviced or destination. Eg "City, Sunnybank, Algester".
11
- property :route_type, Integer # Type of transporation. Eg "Bus".
8
+ # Primary key. Same as +short_name+ because that's the only unique ID we've got.
9
+ property :id, String, :field => 'route_id', :key => true, :unique => true, :unique_index => true
12
10
 
13
- has n, :trips
11
+ # Route code. Eg "130".
12
+ property :short_name, String
13
+
14
+ # Suburbs serviced or destination. Eg "City, Sunnybank, Algester".
15
+ property :long_name, String
16
+
17
+ # Type of transportation. Eg "Bus".
18
+ property :route_type, Integer
19
+
20
+ has n, :trips, :child_key => [:route_id]
14
21
 
15
22
  # Route model for the given +route_page+. Will create the route if it
16
23
  # doesn't exist.
@@ -18,7 +25,8 @@ module Translink
18
25
  # @param route_page [Page::Route] HTML page that represents the route.
19
26
  # @return [Model::Route] DataMapper record.
20
27
  def self.find_or_add_route_from_route_page route_page
21
- first_or_create :short_name => route_page.short_name,
28
+ first_or_create :id => route_page.route_id,
29
+ :short_name => route_page.short_name,
22
30
  :long_name => route_page.long_name,
23
31
  :route_type => route_page.route_type
24
32
  end
@@ -5,8 +5,7 @@ module Translink
5
5
 
6
6
  storage_names[:default] = 'stops'
7
7
 
8
- property :id, Serial
9
- property :stop_id, String
8
+ property :id, String, :field => 'stop_id', :key => true, :unique => true, :unique_index => true
10
9
  property :stop_name, String
11
10
  property :stop_lat, Float
12
11
  property :stop_lon, Float
@@ -19,7 +18,7 @@ module Translink
19
18
  # @param stop_page [Page::Route] HTML page representing the stop.
20
19
  # @return [Model::Stop] DataMapper record.
21
20
  def self.find_or_add_from_stop_page stop_page
22
- first_or_create :stop_id => stop_page.stop_id,
21
+ first_or_create :id => stop_page.stop_id,
23
22
  :stop_name => stop_page.stop_name,
24
23
  :stop_lat => stop_page.stop_lat,
25
24
  :stop_lon => stop_page.stop_lon
@@ -5,9 +5,10 @@ module Translink
5
5
 
6
6
  storage_names[:default] = 'stop_times'
7
7
 
8
- property :id, Serial
9
- property :arrival_time, String
8
+ property :arrival_time, String, :key => true
10
9
  property :stop_sequence, Integer
10
+ property :stop_id, String, :key => true
11
+ property :trip_id, Integer, :key => true
11
12
 
12
13
  belongs_to :stop
13
14
  belongs_to :trip
@@ -5,11 +5,14 @@ module Translink
5
5
 
6
6
  storage_names[:default] = 'trips'
7
7
 
8
- property :id, Serial
9
- property :direction, Integer # Travel in one direction (Regular) or the opposite (Goofy) direction.
10
- property :headsign, String # Name of the direction. Eg "Inbound".
11
- property :service_id, Integer # Service belongs to a trip. Assigned by Translink.
12
- property :trip_id, Integer # Unique ID assigned by Translink.
8
+ # Primary key. Unique ID assigned by Translink.
9
+ property :id, Serial, :field => 'trip_id'
10
+
11
+ # Travel in one direction (Regular) or the opposite (Goofy) direction.
12
+ property :direction, Integer
13
+
14
+ # Name of the direction. Eg "Inbound".
15
+ property :headsign, String
13
16
 
14
17
  belongs_to :route
15
18
 
@@ -40,10 +43,9 @@ module Translink
40
43
  # @param trip_page [Page::Trip] HTML page that represents the trip.
41
44
  # @return [void]
42
45
  def trip_page! trip_page
43
- self.direction = trip_page.direction
44
- self.headsign = trip_page.headsign
45
- self.service_id = trip_page.service_id
46
- self.trip_id = trip_page.trip_id
46
+ self.id = trip_page.trip_id
47
+ self.direction = trip_page.direction
48
+ self.headsign = trip_page.headsign
47
49
  end
48
50
  end
49
51
  end
@@ -1,20 +1,30 @@
1
1
  module Translink
2
2
  class Page
3
+ class UnexpectedParserError < StandardError
4
+ end
5
+
3
6
  USER_AGENT = "Mozilla/5.0 (Translink/#{VERSION} Ruby/#{RUBY_VERSION} (https://github.com/tatey/translink))"
4
-
7
+
5
8
  attr_accessor :agent, :page, :url
6
-
9
+
7
10
  def initialize url
8
11
  @agent = Mechanize.new.tap { |mechanize| mechanize.user_agent = USER_AGENT }
9
12
  @url = URI.parse url
10
13
  end
11
-
14
+
12
15
  def page
13
- @page ||= agent.get url.to_s
16
+ @page ||= begin
17
+ page = agent.get url.to_s
18
+ if page.instance_of? Mechanize::Page
19
+ page
20
+ else
21
+ raise UnexpectedParserError, "Expected instance of Mechanize::Page. Got #{page.class}"
22
+ end
23
+ end
14
24
  end
15
-
25
+
16
26
  protected
17
-
27
+
18
28
  def url_from_href href
19
29
  url.scheme + '://' + url.host + href
20
30
  end
@@ -16,11 +16,28 @@ module Translink
16
16
  @long_name = long_name
17
17
  end
18
18
 
19
+ # Get the route's unique ID assigned by Translink. This is the same
20
+ # as the +short_name+.
21
+ #
22
+ # @return [String]
23
+ def route_id
24
+ @route_id ||= page.search('div#headingBar h1').first.text.sub('Route ', '')
25
+ end
26
+
19
27
  # Gets the route's code.
20
28
  #
21
29
  # @return [String]
22
30
  def short_name
23
- page.search('div#headingBar h1').first.text.sub('Route ', '')
31
+ case route_id
32
+ when 'CGLD'
33
+ 'CityGlider'
34
+ when 'LOOP'
35
+ 'City Loop'
36
+ when 'SHLP'
37
+ 'Spring Hill City Loop'
38
+ else
39
+ route_id
40
+ end
24
41
  end
25
42
 
26
43
  # Get the date this route is running. Trip pages are bound by this
@@ -47,11 +64,33 @@ module Translink
47
64
  headsigns.index anchor.ancestors('div.route-timetable').search('h3').first.text.downcase
48
65
  end
49
66
 
67
+ # Get the date of the trip. If the trip does not have a date, the UNIX
68
+ # epoc is returned.
69
+ #
70
+ # Examples:
71
+ #
72
+ # "/travel-information/network-information/service-information/outbound/9792/2173523/2012-09-24"
73
+ # ... becomes
74
+ # DateTime.new('2012-09-24')
75
+ #
76
+ # "/travel-information/network-information/service-information/outbound/9792/2173523"
77
+ # ... becomes
78
+ # DateTime.new('1970-01-01')
79
+ #
80
+ # @return [DateTime]
81
+ def date_from_anchor anchor
82
+ match = anchor[:href].match /\d{4}-\d{2}-\d{2}$/
83
+ date = match ? match[0] : '1970-01-01'
84
+ DateTime.parse date
85
+ end
86
+
50
87
  # Builds an array of trip pages.
51
88
  #
52
89
  # @return [Array<Page::Trip>]
53
90
  def trip_pages
54
- page.search('a.map-link-top').map do |anchor|
91
+ page.search('a.map-link-top').select do |anchor|
92
+ date_from_anchor(anchor) == date
93
+ end.map do |anchor|
55
94
  Trip.new url_from_href(anchor[:href]), date, direction_from_anchor(anchor)
56
95
  end
57
96
  end
@@ -1,13 +1,20 @@
1
1
  module Translink
2
2
  class Page::Timetable < Page
3
- # Builds an array of route pages.
3
+ # Builds a unique array of route pages.
4
4
  #
5
+ # @param url [URI] Omit routes before the route with +url+.
5
6
  # @return [Array<Page::Route>]
6
- def route_pages
7
- page.search('table tr td:last-child a').reduce Array.new do |pages, anchor|
8
- route = Route.new url_from_href(anchor['href']), anchor.text
9
- pages << route
10
- pages
7
+ def route_pages url = nil, step = nil
8
+ routes = page.search('table tr td:last-child a').reduce(Array.new) do |routes, anchor|
9
+ route = Route.new url_from_href(anchor['href']), anchor.text
10
+ duplicate = routes.find { |duplicate| duplicate.url == route.url }
11
+ routes << route unless duplicate
12
+ routes
13
+ end
14
+ if url
15
+ routes.drop_while { |route| route.url != url }.slice 0..(step || routes.size)
16
+ else
17
+ routes
11
18
  end
12
19
  end
13
20
 
@@ -35,13 +35,6 @@ module Translink
35
35
  attr_accessor :stop_page # [Page::Trip::Stop] Stop associated with the +arrival_time+.
36
36
  attr_accessor :stop_sequence # [Integer] Order in which this stop is visited in the trip.
37
37
 
38
- # Creates a new stop time.
39
- #
40
- # @param stop_sequence [Integer] Order in which this stop is visited.
41
- def initialize stop_sequence
42
- @stop_sequence = stop_sequence
43
- end
44
-
45
38
  # Time vehicle starts from the +stop+. Translink doesn't provide an
46
39
  # explicit +departure_time+ so we use the +arrival_time+.
47
40
  #
@@ -66,7 +59,7 @@ module Translink
66
59
  # or the opposite (Goofy) direction.
67
60
 
68
61
  # Creates a new trip.
69
- #
62
+ #
70
63
  # @param url [String] URL to fetch the page from.
71
64
  # @param date [Date] Date the trip runs on.
72
65
  def initialize url, date, direction
@@ -87,24 +80,25 @@ module Translink
87
80
  #
88
81
  # @return [String]
89
82
  def trip_id
90
- url.to_s =~ /information\/[a-z]+\/([^\/]+)/
91
- $1
92
- end
93
-
94
- # Get the trip's service ID.
95
- #
96
- # @return [String]
97
- def service_id
98
83
  url.to_s =~ /information\/[a-z]+\/[^\/]+\/([^\/]+)/
99
84
  $1
100
85
  end
101
86
 
102
- # Builds an array of stop times.
87
+ # Builds a unique array of stop times.
103
88
  #
104
89
  # @return [Array<Page::Trip::StopTime>]
105
90
  def stop_times
106
- page.search('table#trip-details tbody tr').each_with_index.map do |table_row, index|
107
- StopTime.new(index).html! table_row
91
+ page.search('table#trip-details tbody tr').reduce(Array.new) do |stop_times, table_row|
92
+ stop_time = StopTime.new.html! table_row
93
+ duplicate = stop_times.find do |duplicate|
94
+ duplicate.stop_page.stop_id == stop_time.stop_page.stop_id &&
95
+ duplicate.arrival_time == stop_time.arrival_time
96
+ end
97
+ stop_times << stop_time unless duplicate
98
+ stop_times
99
+ end.each_with_index.map do |stop_time, index|
100
+ stop_time.stop_sequence = index
101
+ stop_time
108
102
  end
109
103
  end
110
104
  end