translink 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/CHANGELOG.md +12 -0
  2. data/README.md +44 -8
  3. data/doc/schema.graffle +396 -82
  4. data/doc/schema.png +0 -0
  5. data/lib/translink/cli.rb +10 -16
  6. data/lib/translink/crawler.rb +5 -6
  7. data/lib/translink/db.rb +6 -14
  8. data/lib/translink/model/route.rb +29 -18
  9. data/lib/translink/model/stop.rb +17 -19
  10. data/lib/translink/model/stop_time.rb +26 -0
  11. data/lib/translink/model/trip.rb +48 -0
  12. data/lib/translink/page/route.rb +42 -18
  13. data/lib/translink/page/timetable.rb +15 -18
  14. data/lib/translink/page/trip.rb +90 -17
  15. data/lib/translink/page.rb +1 -1
  16. data/lib/translink/version.rb +1 -1
  17. data/lib/translink.rb +2 -3
  18. data/test/fixtures/sample/route.html +401 -1049
  19. data/test/fixtures/sample/timetable.html +170 -216
  20. data/test/fixtures/verbatim/route.html +1976 -7178
  21. data/test/fixtures/verbatim/timetable.html +1501 -6165
  22. data/test/fixtures/verbatim/trip.html +311 -508
  23. data/test/unit/cli_test.rb +4 -20
  24. data/test/unit/crawler_test.rb +16 -36
  25. data/test/unit/model/route_test.rb +14 -25
  26. data/test/unit/model/stop_test.rb +6 -31
  27. data/test/unit/model/stop_time_test.rb +11 -0
  28. data/test/unit/model/trip_test.rb +28 -0
  29. data/test/unit/page/route_test.rb +38 -28
  30. data/test/unit/page/timetable_test.rb +12 -10
  31. data/test/unit/page/trip_test.rb +38 -22
  32. data/test/unit/page_test.rb +1 -1
  33. data/translink.gemspec +2 -2
  34. metadata +24 -27
  35. data/lib/translink/code.rb +0 -9
  36. data/lib/translink/model/service.rb +0 -20
  37. data/lib/translink/model/stop/extractor.rb +0 -67
  38. data/test/unit/code_test.rb +0 -12
  39. data/test/unit/model/service_test.rb +0 -23
  40. data/test/unit/model/stop/extractor_test.rb +0 -112
data/doc/schema.png CHANGED
Binary file
data/lib/translink/cli.rb CHANGED
@@ -1,12 +1,11 @@
1
1
  module Translink
2
2
  class CLI
3
- RUNNABLE = ['extract', 'help', 'scrape']
3
+ RUNNABLE = ['help', 'scrape']
4
4
 
5
- attr_accessor :out, :pwd, :__crawler__, :__stop__
5
+ attr_accessor :out, :pwd, :__crawler__
6
6
 
7
7
  def initialize pwd
8
8
  self.__crawler__ = Translink::Crawler
9
- self.__stop__ = Model::Stop
10
9
  self.out = $stdout
11
10
  self.pwd = pwd
12
11
  end
@@ -22,26 +21,21 @@ module Translink
22
21
 
23
22
  protected
24
23
 
25
- def extract input
26
- return help nil unless input =~ /[A-Za-z]:\/\/.+/
27
- DB.new input do
28
- __stop__.all.each do |stop|
29
- stop.extract!
30
- stop.save!
31
- end
32
- end
33
- end
34
-
35
24
  def help input
36
- log 'help'
25
+ tomorrow = Date.today + 1
26
+ log 'Usage: translink scrape <DATE> [URI]'
27
+ log ''
28
+ log 'Examples:'
29
+ log " translink scrape #{tomorrow}"
30
+ log " translink scrape #{tomorrow} sqlite://~/Desktop/#{tomorrow}.sqlite3"
37
31
  end
38
32
 
39
33
  def scrape input
40
34
  return help nil unless input =~ /^(\d{4}-\d{2}-\d{2})(\s+--uri="?(.+)"?)?$/
41
35
  date = Date.parse $1
42
36
  uri = $3 || 'sqlite://' + File.join(pwd, "#{date}.sqlite3")
43
- DB.new uri do
44
- crawler = __crawler__.new 'http://jp.translink.com.au/travel-information/services-and-timetables/buses/all-bus-timetables'
37
+ DB.context uri, :migrate => true do
38
+ crawler = __crawler__.new 'http://jp.translink.com.au/travel-information/network-information/buses/all-timetables'
45
39
  crawler.crawl date
46
40
  end
47
41
  end
@@ -1,19 +1,18 @@
1
1
  module Translink
2
2
  class Crawler
3
- attr_accessor :__model__
4
- attr_reader :url
3
+ attr_reader :url
5
4
 
6
5
  def initialize url
7
- @__model__ = Model::Route
8
- @url = URI.parse url
6
+ @url = URI.parse url
9
7
  end
10
8
 
11
9
  def crawl date
12
10
  timetable_page = Page::Timetable.new(url.to_s).timetable_page date
13
11
  timetable_page.route_pages.each do |route_page|
14
- model = __model__.find_or_add_from_route_page route_page
12
+ route_model = Model::Route.find_or_add_route_from_route_page route_page
15
13
  route_page.trip_pages.each do |trip_page|
16
- model.add_service_from_trip_page trip_page
14
+ trip_model = route_model.add_trip_from_trip_page trip_page
15
+ trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
17
16
  end
18
17
  end
19
18
  end
data/lib/translink/db.rb CHANGED
@@ -1,20 +1,12 @@
1
1
  module Translink
2
- class DB
3
- attr_reader :name, :uri
4
-
5
- def initialize uri, &block
6
- @uri = uri
7
- @name = :default
8
- DataMapper.setup name, uri
9
- DataMapper.repository name do
2
+ module DB
3
+ def self.context uri, options = {}
4
+ DataMapper.setup :default, uri
5
+ DataMapper.repository :default do
10
6
  DataMapper.finalize
11
- DataMapper.auto_migrate!
7
+ DataMapper.auto_migrate! if options[:migrate]
8
+ yield if block_given?
12
9
  end
13
- use &block if block
14
- end
15
-
16
- def use &block
17
- DataMapper.repository(name) { block.call }
18
10
  end
19
11
  end
20
12
  end
@@ -2,26 +2,37 @@ module Translink
2
2
  module Model
3
3
  class Route
4
4
  include DataMapper::Resource
5
-
6
- property :id, Serial
7
- property :code, String
8
- property :name, String
9
- property :translink_id, Integer
10
-
11
- has n, :services
12
- has n, :stops, :through => :services
13
-
14
- def add_service_from_trip_page trip_page
15
- trip_page.trips.each do |trip|
16
- services << Service.build_from_trip(trip)
17
- services.last.save
18
- end
5
+
6
+ storage_names[:default] = 'routes'
7
+
8
+ property :id, Serial
9
+ property :short_name, String # Route code. Eg "130".
10
+ property :long_name, String # Suburbs serviced or destination. Eg "City, Sunnybank, Algester".
11
+ property :route_type, Integer # Type of transportation. Eg "Bus".
12
+
13
+ has n, :trips
14
+
15
+ # Route model for the given +route_page+. Will create the route if it
16
+ # doesn't exist.
17
+ #
18
+ # @param route_page [Page::Route] HTML page that represents the route.
19
+ # @return [Model::Route] DataMapper record.
20
+ def self.find_or_add_route_from_route_page route_page
21
+ first_or_create :short_name => route_page.short_name,
22
+ :long_name => route_page.long_name,
23
+ :route_type => route_page.route_type
19
24
  end
20
25
 
21
- def self.find_or_add_from_route_page route_page
22
- first_or_create :code => route_page.code,
23
- :name => route_page.name,
24
- :translink_id => route_page.translink_id
26
+ # Create a trip.
27
+ #
28
+ # @param trip_page [Page::Trip] HTML page that represents the trip.
29
+ # @return [Model::Trip] DataMapper record.
30
+ def add_trip_from_trip_page trip_page
31
+ Trip.new.tap do |trip|
32
+ trip.route = self
33
+ trip.trip_page! trip_page
34
+ trip.save
35
+ end
25
36
  end
26
37
  end
27
38
  end
@@ -3,28 +3,26 @@ module Translink
3
3
  class Stop
4
4
  include DataMapper::Resource
5
5
 
6
- property :id, Serial
7
- property :name, String
8
- property :summary, String
9
- property :street1, String
10
- property :street2, String
11
- property :locality, String
6
+ storage_names[:default] = 'stops'
12
7
 
13
- has n, :services
14
- has n, :routes, :through => :services
8
+ property :id, Serial
9
+ property :stop_id, String
10
+ property :stop_name, String
11
+ property :stop_lat, Float
12
+ property :stop_lon, Float
15
13
 
16
- attr_accessor :__extractor__
14
+ has n, :stop_times
17
15
 
18
- def self.find_or_add_from_stop stop
19
- Stop.first_or_create :name => stop.name, :summary => stop.summary
20
- end
21
-
22
- def extract!
23
- __extractor__.new(self).extract!
24
- end
25
-
26
- def __extractor__
27
- @__extractor__ ||= Extractor
16
+ # Stop model for the given +stop_page+. Will create the stop if it
17
+ # doesn't exist.
18
+ #
19
+ # @param stop_page [Page::Trip::Stop] HTML page representing the stop.
20
+ # @return [Model::Stop] DataMapper record.
21
+ def self.find_or_add_from_stop_page stop_page
22
+ first_or_create :stop_id => stop_page.stop_id,
23
+ :stop_name => stop_page.stop_name,
24
+ :stop_lat => stop_page.stop_lat,
25
+ :stop_lon => stop_page.stop_lon
28
26
  end
29
27
  end
30
28
  end
@@ -0,0 +1,26 @@
1
+ module Translink
2
+ module Model
3
+ class StopTime
4
+ include DataMapper::Resource
5
+
6
+ storage_names[:default] = 'stop_times'
7
+
8
+ property :id, Serial
9
+ property :arrival_time, String
10
+ property :stop_sequence, Integer
11
+
12
+ belongs_to :stop
13
+ belongs_to :trip
14
+
15
+ # Sets attributes from the +stop_time_page+.
16
+ #
17
+ # @param stop_time_page [Page::Trip::StopTime]
18
+ # @return [void]
19
+ def stop_time_page! stop_time_page
20
+ self.arrival_time = stop_time_page.arrival_time
21
+ self.stop = Stop.find_or_add_from_stop_page stop_time_page.stop_page
22
+ self.stop_sequence = stop_time_page.stop_sequence
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,48 @@
1
+ module Translink
2
+ module Model
3
+ class Trip
4
+ include DataMapper::Resource
5
+
6
+ storage_names[:default] = 'trips'
7
+
8
+ property :id, Serial
9
+ property :direction, String
10
+ property :service_id, Integer # Service belongs to a trip. Assigned by Translink.
11
+ property :trip_id, Integer # Unique ID assigned by Translink.
12
+
13
+ belongs_to :route
14
+
15
+ has n, :stop_times
16
+
17
+ # Creates a +Model::StopTime+ record and associates it with this
18
+ # trip.
19
+ #
20
+ # @param stop_time_page [Page::Trip::StopTime] HTML page representing the
21
+ # stop-time.
22
+ # @return [void]
23
+ def add_stop_time_from_stop_time_page stop_time_page
24
+ StopTime.new.tap do |stop_time|
25
+ stop_time.trip = self
26
+ stop_time.stop_time_page! stop_time_page
27
+ stop_time.save
28
+ end
29
+ end
30
+
31
+ def add_stop_times_from_stop_time_pages stop_time_pages
32
+ stop_time_pages.map do |stop_time_page|
33
+ add_stop_time_from_stop_time_page stop_time_page
34
+ end
35
+ end
36
+
37
+ # Sets properties from the given +trip_page+.
38
+ #
39
+ # @param trip_page [Page::Trip] HTML page that represents the trip.
40
+ # @return [void]
41
+ def trip_page! trip_page
42
+ self.direction = trip_page.direction
43
+ self.service_id = trip_page.service_id
44
+ self.trip_id = trip_page.trip_id
45
+ end
46
+ end
47
+ end
48
+ end
@@ -1,29 +1,53 @@
1
1
  module Translink
2
2
  class Page::Route < Page
3
- def code
4
- page.search('div#contentleftCol table th:nth-child(2)').first.text.strip
3
+ class UnknownRouteTypeError < StandardError
5
4
  end
6
-
7
- def date
8
- Date.parse page.search('div#contentleftCol div.content p span').text
9
- end
10
-
11
- def direction
12
- page.search('div#contentleftCol div.content p').text.match(/(\S+)$/).captures.first
5
+
6
+ ROUTE_TYPES = {'buses' => 3, 'ferries' => 4, 'trains' => 0} # Maps to Google Transit route type.
7
+
8
+ attr_reader :long_name # [String] Usually a list of suburbs.
9
+
10
+ # Creates a new route.
11
+ #
12
+ # @param url [String] URL to fetch the page from.
13
+ # @param long_name [String] Route's long name.
14
+ def initialize url, long_name
15
+ super url
16
+ @long_name = long_name
13
17
  end
14
-
15
- def name
16
- page.search('div#headingBar h1').text
18
+
19
+ # Gets the route's code.
20
+ #
21
+ # @return [String]
22
+ def short_name
23
+ page.search('div#headingBar h1').first.text.sub('Route ', '')
17
24
  end
18
-
19
- def translink_id
20
- url.path.match(/([^\/]+)$/).captures.last
25
+
26
+ # Get the date this route is running. Trip pages are bound by this
27
+ # date.
28
+ #
29
+ # @return [DateTime]
30
+ def date
31
+ DateTime.parse page.search('select#TimetableDate option[selected]').first['value']
21
32
  end
22
-
33
+
34
+ # Builds an array of trip pages.
35
+ #
36
+ # @return [Array<Page::Trip>]
23
37
  def trip_pages
24
- page.search('table:not(:last-child) tfoot a').map do |anchor|
25
- Trip.new url_from_href(anchor[:href])
38
+ page.search('a.map-link-top').map do |anchor|
39
+ Trip.new url_from_href(anchor[:href]), date
26
40
  end
27
41
  end
42
+
43
+ # Get the type of transportation used on the route.
44
+ #
45
+ # @return [Integer]
46
+ # @raise [UnknownRouteTypeError] if the route type is not defined in
47
+ # +ROUTE_TYPES+.
48
+ def route_type
49
+ url.to_s =~ /(buses|ferries|trains)/
50
+ ROUTE_TYPES.fetch($1) { raise UnknownRouteTypeError }
51
+ end
28
52
  end
29
53
  end
@@ -1,31 +1,28 @@
1
1
  module Translink
2
2
  class Page::Timetable < Page
3
- attr_accessor :code_class
4
-
5
- def code_class
6
- @code_class ||= Code
7
- end
8
-
3
+ # Builds an array of route pages.
4
+ #
5
+ # @return [Array<Page::Route>]
9
6
  def route_pages
10
- page.search('table tr td:first-child a').reduce Array.new do |pages, anchor|
11
- route = Route.new url_from_href anchor['href']
12
- pages << route if code_class.brisbane? extract_code_from_anchor(anchor)
7
+ page.search('table tr td:last-child a').reduce Array.new do |pages, anchor|
8
+ route = Route.new url_from_href(anchor['href']), anchor.text
9
+ pages << route
13
10
  pages
14
11
  end
15
12
  end
16
13
 
17
- def timetable_page date
18
- form = page.forms[1]
19
- form.field_with(:name => 'TimetableDate').value = date.to_s
14
+ # Returns a timetable page with routes running on the given date.
15
+ #
16
+ # @param timestamp [DateTime] Filter by this date.
17
+ # TZ should be Australia/Brisbane and the time should be at midnight.
18
+ # @return [Timetable]
19
+ def timetable_page timestamp
20
+ form = page.forms[1]
21
+ value = timestamp.strftime('%-d/%m/%y %I:%M:%S %p') # Eg: "4/06/2012 12:00:00 AM"
22
+ form.field_with(:name => 'Date').value = value
20
23
  self.class.new(url_from_href(form.action)).tap do |page|
21
24
  page.page = form.submit
22
25
  end
23
26
  end
24
-
25
- protected
26
-
27
- def extract_code_from_anchor anchor
28
- anchor.text.gsub(/\-\s(Inbound|Outbound)/, '').strip
29
- end
30
27
  end
31
28
  end
@@ -1,35 +1,108 @@
1
1
  module Translink
2
2
  class Page::Trip < Page
3
- Trip = Struct.new :stop, :time
4
- Stop = Struct.new :name, :summary
3
+ class Stop
4
+ attr_accessor :stop_id # [String] Unique ID.
5
+ attr_accessor :stop_name # [String] Eg: "Queen Street station, platform A6".
6
+ attr_accessor :stop_lat # [String] Eg: "27.470677".
7
+ attr_accessor :stop_lon # [String] Eg: "153.024747".
5
8
 
6
- def date
7
- Date.parse page.search('div#contentleftCol p span').text
9
+ # Tests equality with +other+. Considered equal if +stop_id+ and
10
+ # +stop_name+ are equal.
11
+ #
12
+ # @param other [Page::Trip::Stop] Stop to compare.
13
+ # @return [TrueClass, FalseClass]
14
+ def == other
15
+ stop_id == other.stop_id &&
16
+ stop_name == other.stop_name
17
+ end
18
+
19
+ # Sets attributes by extracting attributes from the HTML fragment.
20
+ #
21
+ # @param node_set [Nokogiri::XML::NodeSet] The HTML fragment to search.
22
+ # @return [Page::Trip::Stop]
23
+ def html! node_set
24
+ anchor = node_set.search('td a').first
25
+ anchor['href'] =~ /([^\/]+)$/
26
+ @stop_id = $1
27
+ @stop_name = anchor.text
28
+ @stop_lat, @stop_lon = node_set['data-position'].split(',')
29
+ self
30
+ end
8
31
  end
9
32
 
10
- def stops
11
- table_rows.search('td:first-child').map do |td|
12
- attributes = td.text.strip.split "\n"
13
- Stop.new *attributes
33
+ class StopTime
34
+ attr_accessor :arrival_time # [String] The time the vehicle arrives at the +stop+.
35
+ attr_accessor :stop_page # [Page::Trip::Stop] Stop associated with the +arrival_time+.
36
+ attr_accessor :stop_sequence # [Integer] Order in which this stop is visited in the trip.
37
+
38
+ # Creates a new stop time.
39
+ #
40
+ # @param stop_sequence [Integer] Order in which this stop is visited.
41
+ def initialize stop_sequence
42
+ @stop_sequence = stop_sequence
43
+ end
44
+
45
+ # Time vehicle starts from the +stop+. Translink doesn't provide an
46
+ # explicit +departure_time+ so we use the +arrival_time+.
47
+ #
48
+ # @return [String] Eg: "10:00 A.M."
49
+ def departure_time
50
+ arrival_time
51
+ end
52
+
53
+ # Sets attributes by extracting attributes from the HTML fragment.
54
+ #
55
+ # @param node_set [Nokogiri::XML::NodeSet] The HTML fragment to search.
56
+ # @return [Page::Trip::StopTime]
57
+ def html! node_set
58
+ @stop_page = Stop.new.html! node_set
59
+ @arrival_time = node_set.search('td').first.text.sub('.', ':').sub(/(a|p)(m)$/, ' \1.M.').upcase # "12:25pm" -> "12:25 P.M."
60
+ self
14
61
  end
15
62
  end
16
63
 
17
- def times
18
- table_rows.search('td:last-child').map { |td| date_time td.text.strip }
64
+ attr_accessor :date # [Date] Date the trip runs on.
65
+
66
+ # Creates a new trip.
67
+ #
68
+ # @param url [String] URL to fetch the page from.
69
+ # @param date [Date] Date the trip runs on.
70
+ def initialize url, date
71
+ super url
72
+ @date = date.to_date
19
73
  end
20
74
 
21
- def trips
22
- stops.zip(times).map { |attributes| Trip.new *attributes }
75
+ # Get the trip's direction of travel.
76
+ #
77
+ # @return [String] "inbound" or "outbound".
78
+ def direction
79
+ url.to_s =~ /information\/([a-z]+)\//
80
+ $1
23
81
  end
24
82
 
25
- protected
83
+ # Get the trip's unique ID.
84
+ #
85
+ # @return [String]
86
+ def trip_id
87
+ url.to_s =~ /information\/[a-z]+\/([^\/]+)/
88
+ $1
89
+ end
26
90
 
27
- def date_time time_string
28
- DateTime.parse "#{date} #{time_string.sub('.', ':')} +1000"
91
+ # Get the trip's service ID.
92
+ #
93
+ # @return [String]
94
+ def service_id
95
+ url.to_s =~ /information\/[a-z]+\/[^\/]+\/([^\/]+)/
96
+ $1
29
97
  end
30
98
 
31
- def table_rows
32
- @table_rows ||= page.search 'tbody tr'
99
+ # Builds an array of stop times.
100
+ #
101
+ # @return [Array<Page::Trip::StopTime>]
102
+ def stop_times
103
+ page.search('table#trip-details tbody tr').each_with_index.map do |table_row, index|
104
+ StopTime.new(index).html! table_row
105
+ end
33
106
  end
34
107
  end
35
108
  end
@@ -1,6 +1,6 @@
1
1
  module Translink
2
2
  class Page
3
- USER_AGENT = "Translink/#{VERSION} Ruby/#{RUBY_VERSION} (https://github.com/tatey/translink)"
3
+ USER_AGENT = "Mozilla/5.0 (Translink/#{VERSION} Ruby/#{RUBY_VERSION} (https://github.com/tatey/translink))"
4
4
 
5
5
  attr_accessor :agent, :page, :url
6
6
 
@@ -1,3 +1,3 @@
1
1
  module Translink
2
- VERSION = '0.0.1'
2
+ VERSION = '1.0.0'
3
3
  end
data/lib/translink.rb CHANGED
@@ -5,13 +5,12 @@ require 'uri'
5
5
 
6
6
  require 'translink/version'
7
7
  require 'translink/cli'
8
- require 'translink/code'
9
8
  require 'translink/crawler'
10
9
  require 'translink/db'
11
10
  require 'translink/model/route'
12
- require 'translink/model/service'
13
11
  require 'translink/model/stop'
14
- require 'translink/model/stop/extractor'
12
+ require 'translink/model/stop_time'
13
+ require 'translink/model/trip'
15
14
  require 'translink/page'
16
15
  require 'translink/page/route'
17
16
  require 'translink/page/timetable'