translink 0.0.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/CHANGELOG.md +12 -0
  2. data/README.md +44 -8
  3. data/doc/schema.graffle +396 -82
  4. data/doc/schema.png +0 -0
  5. data/lib/translink/cli.rb +10 -16
  6. data/lib/translink/crawler.rb +5 -6
  7. data/lib/translink/db.rb +6 -14
  8. data/lib/translink/model/route.rb +29 -18
  9. data/lib/translink/model/stop.rb +17 -19
  10. data/lib/translink/model/stop_time.rb +26 -0
  11. data/lib/translink/model/trip.rb +48 -0
  12. data/lib/translink/page/route.rb +42 -18
  13. data/lib/translink/page/timetable.rb +15 -18
  14. data/lib/translink/page/trip.rb +90 -17
  15. data/lib/translink/page.rb +1 -1
  16. data/lib/translink/version.rb +1 -1
  17. data/lib/translink.rb +2 -3
  18. data/test/fixtures/sample/route.html +401 -1049
  19. data/test/fixtures/sample/timetable.html +170 -216
  20. data/test/fixtures/verbatim/route.html +1976 -7178
  21. data/test/fixtures/verbatim/timetable.html +1501 -6165
  22. data/test/fixtures/verbatim/trip.html +311 -508
  23. data/test/unit/cli_test.rb +4 -20
  24. data/test/unit/crawler_test.rb +16 -36
  25. data/test/unit/model/route_test.rb +14 -25
  26. data/test/unit/model/stop_test.rb +6 -31
  27. data/test/unit/model/stop_time_test.rb +11 -0
  28. data/test/unit/model/trip_test.rb +28 -0
  29. data/test/unit/page/route_test.rb +38 -28
  30. data/test/unit/page/timetable_test.rb +12 -10
  31. data/test/unit/page/trip_test.rb +38 -22
  32. data/test/unit/page_test.rb +1 -1
  33. data/translink.gemspec +2 -2
  34. metadata +24 -27
  35. data/lib/translink/code.rb +0 -9
  36. data/lib/translink/model/service.rb +0 -20
  37. data/lib/translink/model/stop/extractor.rb +0 -67
  38. data/test/unit/code_test.rb +0 -12
  39. data/test/unit/model/service_test.rb +0 -23
  40. data/test/unit/model/stop/extractor_test.rb +0 -112
data/doc/schema.png CHANGED
Binary file
data/lib/translink/cli.rb CHANGED
@@ -1,12 +1,11 @@
1
1
  module Translink
2
2
  class CLI
3
- RUNNABLE = ['extract', 'help', 'scrape']
3
+ RUNNABLE = ['help', 'scrape']
4
4
 
5
- attr_accessor :out, :pwd, :__crawler__, :__stop__
5
+ attr_accessor :out, :pwd, :__crawler__
6
6
 
7
7
  def initialize pwd
8
8
  self.__crawler__ = Translink::Crawler
9
- self.__stop__ = Model::Stop
10
9
  self.out = $stdout
11
10
  self.pwd = pwd
12
11
  end
@@ -22,26 +21,21 @@ module Translink
22
21
 
23
22
  protected
24
23
 
25
- def extract input
26
- return help nil unless input =~ /[A-Za-z]:\/\/.+/
27
- DB.new input do
28
- __stop__.all.each do |stop|
29
- stop.extract!
30
- stop.save!
31
- end
32
- end
33
- end
34
-
35
24
  def help input
36
- log 'help'
25
+ tomorrow = Date.today + 1
26
+ log 'Usage: translink scrape <DATE> [URI]'
27
+ log ''
28
+ log 'Examples:'
29
+ log " translink scrape #{tomorrow}"
30
+ log " translink scrape #{tomorrow} sqlite://~/Desktop/#{tomorrow}.sqlite3"
37
31
  end
38
32
 
39
33
  def scrape input
40
34
  return help nil unless input =~ /^(\d{4}-\d{2}-\d{2})(\s+--uri="?(.+)"?)?$/
41
35
  date = Date.parse $1
42
36
  uri = $3 || 'sqlite://' + File.join(pwd, "#{date}.sqlite3")
43
- DB.new uri do
44
- crawler = __crawler__.new 'http://jp.translink.com.au/travel-information/services-and-timetables/buses/all-bus-timetables'
37
+ DB.context uri, :migrate => true do
38
+ crawler = __crawler__.new 'http://jp.translink.com.au/travel-information/network-information/buses/all-timetables'
45
39
  crawler.crawl date
46
40
  end
47
41
  end
@@ -1,19 +1,18 @@
1
1
  module Translink
2
2
  class Crawler
3
- attr_accessor :__model__
4
- attr_reader :url
3
+ attr_reader :url
5
4
 
6
5
  def initialize url
7
- @__model__ = Model::Route
8
- @url = URI.parse url
6
+ @url = URI.parse url
9
7
  end
10
8
 
11
9
  def crawl date
12
10
  timetable_page = Page::Timetable.new(url.to_s).timetable_page date
13
11
  timetable_page.route_pages.each do |route_page|
14
- model = __model__.find_or_add_from_route_page route_page
12
+ route_model = Model::Route.find_or_add_route_from_route_page route_page
15
13
  route_page.trip_pages.each do |trip_page|
16
- model.add_service_from_trip_page trip_page
14
+ trip_model = route_model.add_trip_from_trip_page trip_page
15
+ trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
17
16
  end
18
17
  end
19
18
  end
data/lib/translink/db.rb CHANGED
@@ -1,20 +1,12 @@
1
1
  module Translink
2
- class DB
3
- attr_reader :name, :uri
4
-
5
- def initialize uri, &block
6
- @uri = uri
7
- @name = :default
8
- DataMapper.setup name, uri
9
- DataMapper.repository name do
2
+ module DB
3
+ def self.context uri, options = {}
4
+ DataMapper.setup :default, uri
5
+ DataMapper.repository :default do
10
6
  DataMapper.finalize
11
- DataMapper.auto_migrate!
7
+ DataMapper.auto_migrate! if options[:migrate]
8
+ yield if block_given?
12
9
  end
13
- use &block if block
14
- end
15
-
16
- def use &block
17
- DataMapper.repository(name) { block.call }
18
10
  end
19
11
  end
20
12
  end
@@ -2,26 +2,37 @@ module Translink
2
2
  module Model
3
3
  class Route
4
4
  include DataMapper::Resource
5
-
6
- property :id, Serial
7
- property :code, String
8
- property :name, String
9
- property :translink_id, Integer
10
-
11
- has n, :services
12
- has n, :stops, :through => :services
13
-
14
- def add_service_from_trip_page trip_page
15
- trip_page.trips.each do |trip|
16
- services << Service.build_from_trip(trip)
17
- services.last.save
18
- end
5
+
6
+ storage_names[:default] = 'routes'
7
+
8
+ property :id, Serial
9
+ property :short_name, String # Route code. Eg "130".
10
+ property :long_name, String # Suburbs serviced or destination. Eg "City, Sunnybank, Algester".
11
+ property :route_type, Integer # Type of transportation. Eg "Bus".
12
+
13
+ has n, :trips
14
+
15
+ # Route model for the given +route_page+. Will create the route if it
16
+ # doesn't exist.
17
+ #
18
+ # @param route_page [Page::Route] HTML page that represents the route.
19
+ # @return [Model::Route] DataMapper record.
20
+ def self.find_or_add_route_from_route_page route_page
21
+ first_or_create :short_name => route_page.short_name,
22
+ :long_name => route_page.long_name,
23
+ :route_type => route_page.route_type
19
24
  end
20
25
 
21
- def self.find_or_add_from_route_page route_page
22
- first_or_create :code => route_page.code,
23
- :name => route_page.name,
24
- :translink_id => route_page.translink_id
26
+ # Create a trip.
27
+ #
28
+ # @param trip_page [Page::Trip] HTML page that represents the trip.
29
+ # @return [Model::Trip] DataMapper record.
30
+ def add_trip_from_trip_page trip_page
31
+ Trip.new.tap do |trip|
32
+ trip.route = self
33
+ trip.trip_page! trip_page
34
+ trip.save
35
+ end
25
36
  end
26
37
  end
27
38
  end
@@ -3,28 +3,26 @@ module Translink
3
3
  class Stop
4
4
  include DataMapper::Resource
5
5
 
6
- property :id, Serial
7
- property :name, String
8
- property :summary, String
9
- property :street1, String
10
- property :street2, String
11
- property :locality, String
6
+ storage_names[:default] = 'stops'
12
7
 
13
- has n, :services
14
- has n, :routes, :through => :services
8
+ property :id, Serial
9
+ property :stop_id, String
10
+ property :stop_name, String
11
+ property :stop_lat, Float
12
+ property :stop_lon, Float
15
13
 
16
- attr_accessor :__extractor__
14
+ has n, :stop_times
17
15
 
18
- def self.find_or_add_from_stop stop
19
- Stop.first_or_create :name => stop.name, :summary => stop.summary
20
- end
21
-
22
- def extract!
23
- __extractor__.new(self).extract!
24
- end
25
-
26
- def __extractor__
27
- @__extractor__ ||= Extractor
16
+ # Stop model for the given +stop_page+. Will create the stop if it
17
+ # doesn't exist.
18
+ #
19
+ # @param stop_page [Page::Trip::Stop] HTML page representing the stop.
20
+ # @return [Model::Stop] DataMapper record.
21
+ def self.find_or_add_from_stop_page stop_page
22
+ first_or_create :stop_id => stop_page.stop_id,
23
+ :stop_name => stop_page.stop_name,
24
+ :stop_lat => stop_page.stop_lat,
25
+ :stop_lon => stop_page.stop_lon
28
26
  end
29
27
  end
30
28
  end
@@ -0,0 +1,26 @@
1
+ module Translink
2
+ module Model
3
+ class StopTime
4
+ include DataMapper::Resource
5
+
6
+ storage_names[:default] = 'stop_times'
7
+
8
+ property :id, Serial
9
+ property :arrival_time, String
10
+ property :stop_sequence, Integer
11
+
12
+ belongs_to :stop
13
+ belongs_to :trip
14
+
15
+ # Sets attributes from the +stop_time_page+.
16
+ #
17
+ # @param stop_time_page [Page::Trip::StopTime]
18
+ # @return [void]
19
+ def stop_time_page! stop_time_page
20
+ self.arrival_time = stop_time_page.arrival_time
21
+ self.stop = Stop.find_or_add_from_stop_page stop_time_page.stop_page
22
+ self.stop_sequence = stop_time_page.stop_sequence
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,48 @@
1
+ module Translink
2
+ module Model
3
+ class Trip
4
+ include DataMapper::Resource
5
+
6
+ storage_names[:default] = 'trips'
7
+
8
+ property :id, Serial
9
+ property :direction, String
10
+ property :service_id, Integer # Service belongs to a trip. Assigned by Translink.
11
+ property :trip_id, Integer # Unique ID assigned by Translink.
12
+
13
+ belongs_to :route
14
+
15
+ has n, :stop_times
16
+
17
+ # Creates a +Model::StopTime+ record and associates it with this
18
+ # trip.
19
+ #
20
+ # @param stop_time_page [Page::Trip::StopTime] HTML page representing the
21
+ # stop-time.
22
+ # @return [Model::StopTime] The stop-time record that was built and saved.
23
+ def add_stop_time_from_stop_time_page stop_time_page
24
+ StopTime.new.tap do |stop_time|
25
+ stop_time.trip = self
26
+ stop_time.stop_time_page! stop_time_page
27
+ stop_time.save
28
+ end
29
+ end
30
+
31
+ def add_stop_times_from_stop_time_pages stop_time_pages
32
+ stop_time_pages.map do |stop_time_page|
33
+ add_stop_time_from_stop_time_page stop_time_page
34
+ end
35
+ end
36
+
37
+ # Sets properties from the given +trip_page+.
38
+ #
39
+ # @param trip_page [Page::Trip] HTML page that represents the trip.
40
+ # @return [void]
41
+ def trip_page! trip_page
42
+ self.direction = trip_page.direction
43
+ self.service_id = trip_page.service_id
44
+ self.trip_id = trip_page.trip_id
45
+ end
46
+ end
47
+ end
48
+ end
@@ -1,29 +1,53 @@
1
1
  module Translink
2
2
  class Page::Route < Page
3
- def code
4
- page.search('div#contentleftCol table th:nth-child(2)').first.text.strip
3
+ class UnknownRouteTypeError < StandardError
5
4
  end
6
-
7
- def date
8
- Date.parse page.search('div#contentleftCol div.content p span').text
9
- end
10
-
11
- def direction
12
- page.search('div#contentleftCol div.content p').text.match(/(\S+)$/).captures.first
5
+
6
+ ROUTE_TYPES = {'buses' => 3, 'ferries' => 4, 'trains' => 0} # Maps to Google Transit route type.
7
+
8
+ attr_reader :long_name # [String] Usually a list of suburbs.
9
+
10
+ # Creates a new route.
11
+ #
12
+ # @param url [String] URL to fetch the page from.
13
+ # @param long_name [String] Route's long name.
14
+ def initialize url, long_name
15
+ super url
16
+ @long_name = long_name
13
17
  end
14
-
15
- def name
16
- page.search('div#headingBar h1').text
18
+
19
+ # Gets the route's code.
20
+ #
21
+ # @return [String]
22
+ def short_name
23
+ page.search('div#headingBar h1').first.text.sub('Route ', '')
17
24
  end
18
-
19
- def translink_id
20
- url.path.match(/([^\/]+)$/).captures.last
25
+
26
+ # Get the date this route is running. Trip pages are bound by this
27
+ # date.
28
+ #
29
+ # @return [DateTime]
30
+ def date
31
+ DateTime.parse page.search('select#TimetableDate option[selected]').first['value']
21
32
  end
22
-
33
+
34
+ # Builds an array of trip pages.
35
+ #
36
+ # @return [Array<Page::Trip>]
23
37
  def trip_pages
24
- page.search('table:not(:last-child) tfoot a').map do |anchor|
25
- Trip.new url_from_href(anchor[:href])
38
+ page.search('a.map-link-top').map do |anchor|
39
+ Trip.new url_from_href(anchor[:href]), date
26
40
  end
27
41
  end
42
+
43
+ # Get the type of transportation used on the route.
44
+ #
45
+ # @return [Integer]
46
+ # @raise [UnknownRouteTypeError] if the route type is not defined in
47
+ # +ROUTE_TYPES+.
48
+ def route_type
49
+ url.to_s =~ /(buses|ferries|trains)/
50
+ ROUTE_TYPES.fetch($1) { raise UnknownRouteTypeError }
51
+ end
28
52
  end
29
53
  end
@@ -1,31 +1,28 @@
1
1
  module Translink
2
2
  class Page::Timetable < Page
3
- attr_accessor :code_class
4
-
5
- def code_class
6
- @code_class ||= Code
7
- end
8
-
3
+ # Builds an array of route pages.
4
+ #
5
+ # @return [Array<Page::Route>]
9
6
  def route_pages
10
- page.search('table tr td:first-child a').reduce Array.new do |pages, anchor|
11
- route = Route.new url_from_href anchor['href']
12
- pages << route if code_class.brisbane? extract_code_from_anchor(anchor)
7
+ page.search('table tr td:last-child a').reduce Array.new do |pages, anchor|
8
+ route = Route.new url_from_href(anchor['href']), anchor.text
9
+ pages << route
13
10
  pages
14
11
  end
15
12
  end
16
13
 
17
- def timetable_page date
18
- form = page.forms[1]
19
- form.field_with(:name => 'TimetableDate').value = date.to_s
14
+ # Returns a timetable page with routes running on the given date.
15
+ #
16
+ # @param timestamp [DateTime] Filter by this date.
17
+ # TZ should be Australia/Brisbane and the time should be at midnight.
18
+ # @return [Timetable]
19
+ def timetable_page timestamp
20
+ form = page.forms[1]
21
+ value = timestamp.strftime('%-d/%m/%y %I:%M:%S %p') # Eg: "4/06/2012 12:00:00 AM"
22
+ form.field_with(:name => 'Date').value = value
20
23
  self.class.new(url_from_href(form.action)).tap do |page|
21
24
  page.page = form.submit
22
25
  end
23
26
  end
24
-
25
- protected
26
-
27
- def extract_code_from_anchor anchor
28
- anchor.text.gsub(/\-\s(Inbound|Outbound)/, '').strip
29
- end
30
27
  end
31
28
  end
@@ -1,35 +1,108 @@
1
1
  module Translink
2
2
  class Page::Trip < Page
3
- Trip = Struct.new :stop, :time
4
- Stop = Struct.new :name, :summary
3
+ class Stop
4
+ attr_accessor :stop_id # [String] Unique ID.
5
+ attr_accessor :stop_name # [String] Eg: "Queen Street station, platform A6".
6
+ attr_accessor :stop_lat # [String] Eg: "27.470677".
7
+ attr_accessor :stop_lon # [String] Eg: "153.024747".
5
8
 
6
- def date
7
- Date.parse page.search('div#contentleftCol p span').text
9
+ # Tests equality with +other+. Considered equal if +stop_id+ and
10
+ # +stop_name+ are equal.
11
+ #
12
+ # @param other [Page::Trip::Stop] Stop to compare.
13
+ # @return [TrueClass, FalseClass]
14
+ def == other
15
+ stop_id == other.stop_id &&
16
+ stop_name == other.stop_name
17
+ end
18
+
19
+ # Sets attributes by extracting attributes from the HTML fragment.
20
+ #
21
+ # @param node_set [Nokogiri::XML::NodeSet] The HTML fragment to search.
22
+ # @return [Page::Trip::Stop]
23
+ def html! node_set
24
+ anchor = node_set.search('td a').first
25
+ anchor['href'] =~ /([^\/]+)$/
26
+ @stop_id = $1
27
+ @stop_name = anchor.text
28
+ @stop_lat, @stop_lon = node_set['data-position'].split(',')
29
+ self
30
+ end
8
31
  end
9
32
 
10
- def stops
11
- table_rows.search('td:first-child').map do |td|
12
- attributes = td.text.strip.split "\n"
13
- Stop.new *attributes
33
+ class StopTime
34
+ attr_accessor :arrival_time # [String] The time the vehicle arrives at the +stop+.
35
+ attr_accessor :stop_page # [Page::Trip::Stop] Stop associated with the +arrival_time+.
36
+ attr_accessor :stop_sequence # [Integer] Order in which this stop is visited in the trip.
37
+
38
+ # Creates a new stop time.
39
+ #
40
+ # @param stop_sequence [Integer] Order in which this stop is visited.
41
+ def initialize stop_sequence
42
+ @stop_sequence = stop_sequence
43
+ end
44
+
45
+ # Time vehicle starts from the +stop+. Translink doesn't provide an
46
+ # explicit +departure_time+ so we use the +arrival_time+.
47
+ #
48
+ # @return [String] Eg: "10:00 A.M."
49
+ def departure_time
50
+ arrival_time
51
+ end
52
+
53
+ # Sets attributes by extracting attributes from the HTML fragment.
54
+ #
55
+ # @param node_set [Nokogiri::XML::NodeSet] The HTML fragment to search.
56
+ # @return [Page::Trip::StopTime]
57
+ def html! node_set
58
+ @stop_page = Stop.new.html! node_set
59
+ @arrival_time = node_set.search('td').first.text.sub('.', ':').sub(/(a|p)(m)$/, ' \1.M.').upcase # "12:25pm" -> "12:25 P.M."
60
+ self
14
61
  end
15
62
  end
16
63
 
17
- def times
18
- table_rows.search('td:last-child').map { |td| date_time td.text.strip }
64
+ attr_accessor :date # [Date] Date the trip runs on.
65
+
66
+ # Creates a new trip.
67
+ #
68
+ # @param url [String] URL to fetch the page from.
69
+ # @param date [Date] Date the trip runs on.
70
+ def initialize url, date
71
+ super url
72
+ @date = date.to_date
19
73
  end
20
74
 
21
- def trips
22
- stops.zip(times).map { |attributes| Trip.new *attributes }
75
+ # Get the trip's direction of travel.
76
+ #
77
+ # @return [String] "inbound" or "outbound".
78
+ def direction
79
+ url.to_s =~ /information\/([a-z]+)\//
80
+ $1
23
81
  end
24
82
 
25
- protected
83
+ # Get the trip's unique ID.
84
+ #
85
+ # @return [String]
86
+ def trip_id
87
+ url.to_s =~ /information\/[a-z]+\/([^\/]+)/
88
+ $1
89
+ end
26
90
 
27
- def date_time time_string
28
- DateTime.parse "#{date} #{time_string.sub('.', ':')} +1000"
91
+ # Get the trip's service ID.
92
+ #
93
+ # @return [String]
94
+ def service_id
95
+ url.to_s =~ /information\/[a-z]+\/[^\/]+\/([^\/]+)/
96
+ $1
29
97
  end
30
98
 
31
- def table_rows
32
- @table_rows ||= page.search 'tbody tr'
99
+ # Builds an array of stop times.
100
+ #
101
+ # @return [Array<Page::Trip::StopTime>]
102
+ def stop_times
103
+ page.search('table#trip-details tbody tr').each_with_index.map do |table_row, index|
104
+ StopTime.new(index).html! table_row
105
+ end
33
106
  end
34
107
  end
35
108
  end
@@ -1,6 +1,6 @@
1
1
  module Translink
2
2
  class Page
3
- USER_AGENT = "Translink/#{VERSION} Ruby/#{RUBY_VERSION} (https://github.com/tatey/translink)"
3
+ USER_AGENT = "Mozilla/5.0 (Translink/#{VERSION} Ruby/#{RUBY_VERSION} (https://github.com/tatey/translink))"
4
4
 
5
5
  attr_accessor :agent, :page, :url
6
6
 
@@ -1,3 +1,3 @@
1
1
  module Translink
2
- VERSION = '0.0.1'
2
+ VERSION = '1.0.0'
3
3
  end
data/lib/translink.rb CHANGED
@@ -5,13 +5,12 @@ require 'uri'
5
5
 
6
6
  require 'translink/version'
7
7
  require 'translink/cli'
8
- require 'translink/code'
9
8
  require 'translink/crawler'
10
9
  require 'translink/db'
11
10
  require 'translink/model/route'
12
- require 'translink/model/service'
13
11
  require 'translink/model/stop'
14
- require 'translink/model/stop/extractor'
12
+ require 'translink/model/stop_time'
13
+ require 'translink/model/trip'
15
14
  require 'translink/page'
16
15
  require 'translink/page/route'
17
16
  require 'translink/page/timetable'