translink 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +18 -0
- data/README.md +26 -15
- data/doc/schema.graffle +67 -69
- data/doc/schema.png +0 -0
- data/lib/translink/cli.rb +35 -9
- data/lib/translink/crawler.rb +41 -8
- data/lib/translink/db.rb +46 -2
- data/lib/translink/model/route.rb +14 -6
- data/lib/translink/model/stop.rb +2 -3
- data/lib/translink/model/stop_time.rb +3 -2
- data/lib/translink/model/trip.rb +11 -9
- data/lib/translink/page.rb +16 -6
- data/lib/translink/page/route.rb +41 -2
- data/lib/translink/page/timetable.rb +13 -6
- data/lib/translink/page/trip.rb +13 -19
- data/lib/translink/version.rb +1 -1
- data/test/fixtures/sample/route.html +1 -1
- data/test/fixtures/verbatim/route.html +294 -270
- data/test/fixtures/verbatim/route/date_from_anchor.html +2727 -0
- data/test/fixtures/verbatim/timetable/duplicate_routes.html +2293 -0
- data/test/fixtures/verbatim/trip/duplicate_stop_times.html +589 -0
- data/test/unit/cli_test.rb +26 -7
- data/test/unit/model/route_test.rb +3 -2
- data/test/unit/model/trip_test.rb +3 -3
- data/test/unit/page/route_test.rb +36 -5
- data/test/unit/page/timetable_test.rb +29 -0
- data/test/unit/page/trip_test.rb +12 -5
- data/translink.gemspec +3 -3
- metadata +62 -21
data/doc/schema.png
CHANGED
Binary file
|
data/lib/translink/cli.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Translink
|
2
2
|
class CLI
|
3
|
-
RUNNABLE = ['help', 'scrape']
|
3
|
+
RUNNABLE = ['help', 'scrape', 'version']
|
4
|
+
URL = 'http://jp.translink.com.au/travel-information/network-information/buses/all-timetables'
|
4
5
|
|
5
6
|
attr_accessor :out, :pwd, :__crawler__
|
6
7
|
|
@@ -23,23 +24,48 @@ module Translink
|
|
23
24
|
|
24
25
|
def help input
|
25
26
|
tomorrow = Date.today + 1
|
26
|
-
log 'Usage: translink scrape <DATE> [
|
27
|
+
log 'Usage: translink scrape <DATE> [DB_PATH] [FROM_ROUTE_URL] [STEP]'
|
28
|
+
log ' translink version'
|
27
29
|
log ''
|
28
30
|
log 'Examples:'
|
29
31
|
log " translink scrape #{tomorrow}"
|
30
|
-
log " translink scrape #{tomorrow}
|
32
|
+
log " translink scrape #{tomorrow} ~/Desktop/#{tomorrow}.sqlite3"
|
33
|
+
log " translink scrape #{tomorrow} ~/Desktop/#{tomorrow}.sqlite3 http://jp.translink.com.au/travel-information/network-information/buses/435"
|
34
|
+
log " translink scrape #{tomorrow} ~/Desktop/#{tomorrow}.sqlite3 http://jp.translink.com.au/travel-information/network-information/buses/435/#{tomorrow} 0"
|
31
35
|
end
|
32
36
|
|
33
37
|
def scrape input
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
38
|
+
args = (input || '').split /\s/
|
39
|
+
case args.size
|
40
|
+
when 1
|
41
|
+
date = Date.parse args[0]
|
42
|
+
db_path = File.join(pwd, "#{date}.sqlite3")
|
43
|
+
when 2
|
44
|
+
date = Date.parse args[0]
|
45
|
+
db_path = File.expand_path args[1]
|
46
|
+
when 3
|
47
|
+
date = Date.parse args[0]
|
48
|
+
db_path = File.expand_path args[1]
|
49
|
+
from_route_url = URI.parse args[2]
|
50
|
+
when 4
|
51
|
+
date = Date.parse args[0]
|
52
|
+
db_path = File.expand_path args[1]
|
53
|
+
from_route_url = URI.parse args[2]
|
54
|
+
step = args[3].to_i
|
55
|
+
else
|
56
|
+
help nil
|
57
|
+
return
|
58
|
+
end
|
59
|
+
DB.context "sqlite://#{db_path}", :migrate => !File.exists?(db_path) do
|
60
|
+
crawler = __crawler__.new URL
|
61
|
+
crawler.crawl date, from_route_url, step
|
40
62
|
end
|
41
63
|
end
|
42
64
|
|
65
|
+
def version input
|
66
|
+
log VERSION
|
67
|
+
end
|
68
|
+
|
43
69
|
def log message
|
44
70
|
out.puts message
|
45
71
|
end
|
data/lib/translink/crawler.rb
CHANGED
@@ -1,20 +1,53 @@
|
|
1
1
|
module Translink
|
2
2
|
class Crawler
|
3
|
-
|
3
|
+
MAX_RETRY_COUNT = 2 # Maximum number of times to attempt a HTTP request.
|
4
|
+
SLEEP_DURATION = 5 # Base amount of time to sleep in seconds before retrying.
|
5
|
+
|
6
|
+
attr_accessor :out
|
7
|
+
attr_reader :url
|
4
8
|
|
5
9
|
def initialize url
|
6
10
|
@url = URI.parse url
|
11
|
+
@out = $stdout
|
7
12
|
end
|
8
13
|
|
9
|
-
def crawl date
|
14
|
+
def crawl date, from_route_url = nil, step = nil
|
10
15
|
timetable_page = Page::Timetable.new(url.to_s).timetable_page date
|
11
|
-
timetable_page.route_pages.each do |route_page|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
16
|
+
timetable_page.route_pages(from_route_url, step).each do |route_page|
|
17
|
+
crawl_route_page route_page
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def crawl_route_page route_page, retry_count = 0
|
22
|
+
route_model = Model::Route.find_or_add_route_from_route_page route_page
|
23
|
+
route_page.trip_pages.each do |trip_page|
|
24
|
+
crawl_trip_page route_model, trip_page
|
25
|
+
end
|
26
|
+
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
|
27
|
+
if retry_count <= MAX_RETRY_COUNT
|
28
|
+
sleep SLEEP_DURATION * retry_count
|
29
|
+
crawl_route_page route_page, retry_count + 1
|
30
|
+
else
|
31
|
+
out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
|
32
|
+
end
|
33
|
+
rescue => exception
|
34
|
+
out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
|
35
|
+
out.puts exception.backtrace
|
36
|
+
end
|
37
|
+
|
38
|
+
def crawl_trip_page route_model, trip_page, retry_count = 0
|
39
|
+
trip_model = route_model.add_trip_from_trip_page trip_page
|
40
|
+
trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
|
41
|
+
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
|
42
|
+
if retry_count <= MAX_RETRY_COUNT
|
43
|
+
sleep SLEEP_DURATION * retry_count
|
44
|
+
crawl_trip_page route_model, trip_page, retry_count + 1
|
45
|
+
else
|
46
|
+
out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
|
17
47
|
end
|
48
|
+
rescue => exception
|
49
|
+
out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
|
50
|
+
out.puts exception.backtrace
|
18
51
|
end
|
19
52
|
end
|
20
53
|
end
|
data/lib/translink/db.rb
CHANGED
@@ -2,11 +2,55 @@ module Translink
|
|
2
2
|
module DB
|
3
3
|
def self.context uri, options = {}
|
4
4
|
DataMapper.setup :default, uri
|
5
|
-
DataMapper.repository :default do
|
5
|
+
DataMapper.repository :default do |repository|
|
6
6
|
DataMapper.finalize
|
7
|
-
|
7
|
+
repository.adapter.execute <<-SQL
|
8
|
+
PRAGMA foreign_keys=ON;
|
9
|
+
SQL
|
10
|
+
if options[:migrate]
|
11
|
+
repository.adapter.execute <<-SQL
|
12
|
+
DROP TABLE IF EXISTS "routes";
|
13
|
+
CREATE TABLE "routes" (
|
14
|
+
"route_id" TEXT NOT NULL PRIMARY KEY UNIQUE,
|
15
|
+
"short_name" TEXT NOT NULL,
|
16
|
+
"long_name" TEXT NOT NULL,
|
17
|
+
"route_type" INTEGER NOT NULL
|
18
|
+
);
|
19
|
+
|
20
|
+
DROP TABLE IF EXISTS "trips";
|
21
|
+
CREATE TABLE "trips" (
|
22
|
+
"trip_id" INTEGER NOT NULL PRIMARY KEY UNIQUE,
|
23
|
+
"direction" INTEGER NOT NULL,
|
24
|
+
"headsign" TEXT NOT NULL,
|
25
|
+
"route_id" TEXT NOT NULL,
|
26
|
+
FOREIGN KEY ("route_id") REFERENCES "routes" ("route_id") ON DELETE CASCADE
|
27
|
+
);
|
28
|
+
|
29
|
+
CREATE INDEX "index_trips_on_route_id" ON "trips" ("route_id");
|
30
|
+
|
31
|
+
DROP TABLE IF EXISTS "stop_times";
|
32
|
+
CREATE TABLE "stop_times" (
|
33
|
+
"arrival_time" STRING NOT NULL,
|
34
|
+
"stop_sequence" INTEGER NOT NULL,
|
35
|
+
"stop_id" TEXT NOT NULL,
|
36
|
+
"trip_id" INTEGER NOT NULL,
|
37
|
+
PRIMARY KEY ("arrival_time", "stop_id", "trip_id"),
|
38
|
+
FOREIGN KEY ("stop_id") REFERENCES "stops" ("stop_id") ON DELETE RESTRICT,
|
39
|
+
FOREIGN KEY ("trip_id") REFERENCES "trips" ("trip_id") ON DELETE CASCADE
|
40
|
+
);
|
41
|
+
|
42
|
+
DROP TABLE IF EXISTS "stops";
|
43
|
+
CREATE TABLE "stops" (
|
44
|
+
"stop_id" TEXT NOT NULL PRIMARY KEY UNIQUE,
|
45
|
+
"stop_name" TEXT NOT NULL,
|
46
|
+
"stop_lat" REAL NOT NULL,
|
47
|
+
"stop_lon" REAL NOT NULL
|
48
|
+
);
|
49
|
+
SQL
|
50
|
+
end
|
8
51
|
yield if block_given?
|
9
52
|
end
|
10
53
|
end
|
11
54
|
end
|
12
55
|
end
|
56
|
+
|
@@ -5,12 +5,19 @@ module Translink
|
|
5
5
|
|
6
6
|
storage_names[:default] = 'routes'
|
7
7
|
|
8
|
-
|
9
|
-
property :
|
10
|
-
property :long_name, String # Suburbs serviced or destination. Eg "City, Sunnybank, Algester".
|
11
|
-
property :route_type, Integer # Type of transporation. Eg "Bus".
|
8
|
+
# Primary key. Same as +short_name+ because that's the only unique ID we've got.
|
9
|
+
property :id, String, :field => 'route_id', :key => true, :unique => true, :unique_index => true
|
12
10
|
|
13
|
-
|
11
|
+
# Route code. Eg "130".
|
12
|
+
property :short_name, String
|
13
|
+
|
14
|
+
# Suburbs serviced or destination. Eg "City, Sunnybank, Algester".
|
15
|
+
property :long_name, String
|
16
|
+
|
17
|
+
# Type of transportation. Eg "Bus".
|
18
|
+
property :route_type, Integer
|
19
|
+
|
20
|
+
has n, :trips, :child_key => [:route_id]
|
14
21
|
|
15
22
|
# Route model for the given +route_page+. Will create the route if it
|
16
23
|
# doesn't exist.
|
@@ -18,7 +25,8 @@ module Translink
|
|
18
25
|
# @param route_page [Page::Route] HTML page that represents the route.
|
19
26
|
# @return [Model::Route] DataMapper record.
|
20
27
|
def self.find_or_add_route_from_route_page route_page
|
21
|
-
first_or_create :
|
28
|
+
first_or_create :id => route_page.route_id,
|
29
|
+
:short_name => route_page.short_name,
|
22
30
|
:long_name => route_page.long_name,
|
23
31
|
:route_type => route_page.route_type
|
24
32
|
end
|
data/lib/translink/model/stop.rb
CHANGED
@@ -5,8 +5,7 @@ module Translink
|
|
5
5
|
|
6
6
|
storage_names[:default] = 'stops'
|
7
7
|
|
8
|
-
property :id,
|
9
|
-
property :stop_id, String
|
8
|
+
property :id, String, :field => 'stop_id', :key => true, :unique => true, :unique_index => true
|
10
9
|
property :stop_name, String
|
11
10
|
property :stop_lat, Float
|
12
11
|
property :stop_lon, Float
|
@@ -19,7 +18,7 @@ module Translink
|
|
19
18
|
# @param stop_page [Page::Trip::Stop] HTML page representing the stop.
|
20
19
|
# @return [Model::Stop] DataMapper record.
|
21
20
|
def self.find_or_add_from_stop_page stop_page
|
22
|
-
first_or_create :
|
21
|
+
first_or_create :id => stop_page.stop_id,
|
23
22
|
:stop_name => stop_page.stop_name,
|
24
23
|
:stop_lat => stop_page.stop_lat,
|
25
24
|
:stop_lon => stop_page.stop_lon
|
@@ -5,9 +5,10 @@ module Translink
|
|
5
5
|
|
6
6
|
storage_names[:default] = 'stop_times'
|
7
7
|
|
8
|
-
property :
|
9
|
-
property :arrival_time, String
|
8
|
+
property :arrival_time, String, :key => true
|
10
9
|
property :stop_sequence, Integer
|
10
|
+
property :stop_id, String, :key => true
|
11
|
+
property :trip_id, Integer, :key => true
|
11
12
|
|
12
13
|
belongs_to :stop
|
13
14
|
belongs_to :trip
|
data/lib/translink/model/trip.rb
CHANGED
@@ -5,11 +5,14 @@ module Translink
|
|
5
5
|
|
6
6
|
storage_names[:default] = 'trips'
|
7
7
|
|
8
|
-
|
9
|
-
property :
|
10
|
-
|
11
|
-
|
12
|
-
property :
|
8
|
+
# Primary key. Unique ID assigned by Translink.
|
9
|
+
property :id, Serial, :field => 'trip_id'
|
10
|
+
|
11
|
+
# Travel in one direction (Regular) or the opposite (Goofy) direction.
|
12
|
+
property :direction, Integer
|
13
|
+
|
14
|
+
# Name of the direction. Eg "Inbound".
|
15
|
+
property :headsign, String
|
13
16
|
|
14
17
|
belongs_to :route
|
15
18
|
|
@@ -40,10 +43,9 @@ module Translink
|
|
40
43
|
# @param trip_page [Trip::Page] HTML page that represents the trip.
|
41
44
|
# @return [void]
|
42
45
|
def trip_page! trip_page
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
self.trip_id = trip_page.trip_id
|
46
|
+
self.id = trip_page.trip_id
|
47
|
+
self.direction = trip_page.direction
|
48
|
+
self.headsign = trip_page.headsign
|
47
49
|
end
|
48
50
|
end
|
49
51
|
end
|
data/lib/translink/page.rb
CHANGED
@@ -1,20 +1,30 @@
|
|
1
1
|
module Translink
|
2
2
|
class Page
|
3
|
+
class UnexpectedParserError < StandardError
|
4
|
+
end
|
5
|
+
|
3
6
|
USER_AGENT = "Mozilla/5.0 (Translink/#{VERSION} Ruby/#{RUBY_VERSION} (https://github.com/tatey/translink))"
|
4
|
-
|
7
|
+
|
5
8
|
attr_accessor :agent, :page, :url
|
6
|
-
|
9
|
+
|
7
10
|
def initialize url
|
8
11
|
@agent = Mechanize.new.tap { |mechanize| mechanize.user_agent = USER_AGENT }
|
9
12
|
@url = URI.parse url
|
10
13
|
end
|
11
|
-
|
14
|
+
|
12
15
|
def page
|
13
|
-
@page ||=
|
16
|
+
@page ||= begin
|
17
|
+
page = agent.get url.to_s
|
18
|
+
if page.instance_of? Mechanize::Page
|
19
|
+
page
|
20
|
+
else
|
21
|
+
raise UnexpectedParserError, "Expected instance of Mechanize::Page. Got #{page.class}"
|
22
|
+
end
|
23
|
+
end
|
14
24
|
end
|
15
|
-
|
25
|
+
|
16
26
|
protected
|
17
|
-
|
27
|
+
|
18
28
|
def url_from_href href
|
19
29
|
url.scheme + '://' + url.host + href
|
20
30
|
end
|
data/lib/translink/page/route.rb
CHANGED
@@ -16,11 +16,28 @@ module Translink
|
|
16
16
|
@long_name = long_name
|
17
17
|
end
|
18
18
|
|
19
|
+
# Get the route's unique ID assigned by Translink. This is the same
|
20
|
+
# as the +short_name+.
|
21
|
+
#
|
22
|
+
# @return [String]
|
23
|
+
def route_id
|
24
|
+
@route_id ||= page.search('div#headingBar h1').first.text.sub('Route ', '')
|
25
|
+
end
|
26
|
+
|
19
27
|
# Gets the route's code.
|
20
28
|
#
|
21
29
|
# @return [String]
|
22
30
|
def short_name
|
23
|
-
|
31
|
+
case route_id
|
32
|
+
when 'CGLD'
|
33
|
+
'CityGlider'
|
34
|
+
when 'LOOP'
|
35
|
+
'City Loop'
|
36
|
+
when 'SHLP'
|
37
|
+
'Spring Hill City Loop'
|
38
|
+
else
|
39
|
+
route_id
|
40
|
+
end
|
24
41
|
end
|
25
42
|
|
26
43
|
# Get the date this route is running. Trip pages are bound by this
|
@@ -47,11 +64,33 @@ module Translink
|
|
47
64
|
headsigns.index anchor.ancestors('div.route-timetable').search('h3').first.text.downcase
|
48
65
|
end
|
49
66
|
|
67
|
+
# Get the date of the trip. If the trip does not have a date, the UNIX
|
68
|
+
# epoch is returned.
|
69
|
+
#
|
70
|
+
# Examples:
|
71
|
+
#
|
72
|
+
# "/travel-information/network-information/service-information/outbound/9792/2173523/2012-09-24"
|
73
|
+
# ... becomes
|
74
|
+
# DateTime.parse('2012-09-24')
|
75
|
+
#
|
76
|
+
# "/travel-information/network-information/service-information/outbound/9792/2173523"
|
77
|
+
# ... becomes
|
78
|
+
# DateTime.parse('1970-01-01')
|
79
|
+
#
|
80
|
+
# @return [DateTime]
|
81
|
+
def date_from_anchor anchor
|
82
|
+
match = anchor[:href].match /\d{4}-\d{2}-\d{2}$/
|
83
|
+
date = match ? match[0] : '1970-01-01'
|
84
|
+
DateTime.parse date
|
85
|
+
end
|
86
|
+
|
50
87
|
# Builds an array of trip pages.
|
51
88
|
#
|
52
89
|
# @return [Array<Page::Trip>]
|
53
90
|
def trip_pages
|
54
|
-
page.search('a.map-link-top').
|
91
|
+
page.search('a.map-link-top').select do |anchor|
|
92
|
+
date_from_anchor(anchor) == date
|
93
|
+
end.map do |anchor|
|
55
94
|
Trip.new url_from_href(anchor[:href]), date, direction_from_anchor(anchor)
|
56
95
|
end
|
57
96
|
end
|
@@ -1,13 +1,20 @@
|
|
1
1
|
module Translink
|
2
2
|
class Page::Timetable < Page
|
3
|
-
# Builds an array of route pages.
|
3
|
+
# Builds a unique array of route pages.
|
4
4
|
#
|
5
|
+
# @param url [URI] Omit routes before the route with +url+.
|
5
6
|
# @return [Array<Page::Route>]
|
6
|
-
def route_pages
|
7
|
-
page.search('table tr td:last-child a').reduce
|
8
|
-
route
|
9
|
-
|
10
|
-
|
7
|
+
def route_pages url = nil, step = nil
|
8
|
+
routes = page.search('table tr td:last-child a').reduce(Array.new) do |routes, anchor|
|
9
|
+
route = Route.new url_from_href(anchor['href']), anchor.text
|
10
|
+
duplicate = routes.find { |duplicate| duplicate.url == route.url }
|
11
|
+
routes << route unless duplicate
|
12
|
+
routes
|
13
|
+
end
|
14
|
+
if url
|
15
|
+
routes.drop_while { |route| route.url != url }.slice 0..(step || routes.size)
|
16
|
+
else
|
17
|
+
routes
|
11
18
|
end
|
12
19
|
end
|
13
20
|
|
data/lib/translink/page/trip.rb
CHANGED
@@ -35,13 +35,6 @@ module Translink
|
|
35
35
|
attr_accessor :stop_page # [Page::Trip::Stop] Stop associated with the +arrival_time+.
|
36
36
|
attr_accessor :stop_sequence # [Integer] Order in which this stop is visited in the trip.
|
37
37
|
|
38
|
-
# Creates a new stop time.
|
39
|
-
#
|
40
|
-
# @param stop_sequence [Integer] Order in which this stop is visited.
|
41
|
-
def initialize stop_sequence
|
42
|
-
@stop_sequence = stop_sequence
|
43
|
-
end
|
44
|
-
|
45
38
|
# Time vehicle starts from the +stop+. Translink doesn't provide an
|
46
39
|
# explicit +departure_time+ so we use the +arrival_time+.
|
47
40
|
#
|
@@ -66,7 +59,7 @@ module Translink
|
|
66
59
|
# or the opposite (Goofy) direction.
|
67
60
|
|
68
61
|
# Creates a new trip.
|
69
|
-
#
|
62
|
+
#
|
70
63
|
# @param url [String] URL to fetch the page from.
|
71
64
|
# @param date [Date] Date the trip runs on.
|
72
65
|
def initialize url, date, direction
|
@@ -87,24 +80,25 @@ module Translink
|
|
87
80
|
#
|
88
81
|
# @return [String]
|
89
82
|
def trip_id
|
90
|
-
url.to_s =~ /information\/[a-z]+\/([^\/]+)/
|
91
|
-
$1
|
92
|
-
end
|
93
|
-
|
94
|
-
# Get the trip's service ID.
|
95
|
-
#
|
96
|
-
# @return [String]
|
97
|
-
def service_id
|
98
83
|
url.to_s =~ /information\/[a-z]+\/[^\/]+\/([^\/]+)/
|
99
84
|
$1
|
100
85
|
end
|
101
86
|
|
102
|
-
# Builds an array of stop times.
|
87
|
+
# Builds a unique array of stop times.
|
103
88
|
#
|
104
89
|
# @return [Array<Page::Trip::StopTime>]
|
105
90
|
def stop_times
|
106
|
-
page.search('table#trip-details tbody tr').
|
107
|
-
StopTime.new
|
91
|
+
page.search('table#trip-details tbody tr').reduce(Array.new) do |stop_times, table_row|
|
92
|
+
stop_time = StopTime.new.html! table_row
|
93
|
+
duplicate = stop_times.find do |duplicate|
|
94
|
+
duplicate.stop_page.stop_id == stop_time.stop_page.stop_id &&
|
95
|
+
duplicate.arrival_time == stop_time.arrival_time
|
96
|
+
end
|
97
|
+
stop_times << stop_time unless duplicate
|
98
|
+
stop_times
|
99
|
+
end.each_with_index.map do |stop_time, index|
|
100
|
+
stop_time.stop_sequence = index
|
101
|
+
stop_time
|
108
102
|
end
|
109
103
|
end
|
110
104
|
end
|