southeastern-daily-performance 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## Intro
2
+
3
+ Southeastern publish [Daily Performance reports](http://www.southeasternrailway.co.uk/your-journey/daily-performance/). They're not very usable as they are. This project currently contains the code I've used to parse that data and the parsed data itself.
4
+
5
+ I've also popped the data into [this Google Fusion table](http://www.google.com/fusiontables/DataSource?dsrcid=359310) and added a bit of a [query tool](http://chrisroos.github.com/southeastern-daily-performance/) and [dashboard](http://chrisroos.github.com/southeastern-daily-performance/dashboard.html) (of sorts).
6
+
1
7
  ## Installation
2
8
 
3
9
  $ gem install southeastern-daily-performance -r http://gemcutter.org
@@ -13,7 +19,26 @@
13
19
  $ curl "http://www.southeasternrailway.co.uk/index.php/cms/pages/view/132" > sedpr.html
14
20
  $ sedpr-to-csv sedpr.html
15
21
 
16
-
17
22
  ### Implicitly download html and convert
18
23
 
19
- $ sedpr-to-csv http://www.southeasternrailway.co.uk/index.php/cms/pages/view/132
24
+ $ sedpr-to-csv http://www.southeasternrailway.co.uk/index.php/cms/pages/view/132
25
+
26
+ ## Notes
27
+
28
+ ### Combining all csv files into one big file
29
+
30
+ $ echo "Date,Problem,Scheduled departure time,Scheduled departure station,Scheduled arrival station,Effect on service" > combined.csv
31
+ $ cat /path/to/csv/files/*.csv >> combined.csv
32
+
33
+ ### Combining all csv overview files into one file
34
+
35
+ $ echo "Date,Services scheduled,Services run,Services within 5 minutes of schedule" > combined.overview.csv
36
+ $ cat /path/to/csv/files/*.overview.csv >> combined.overview.csv
37
+
38
+ ## TODO
39
+
40
+ * 2010-04-21 breaks the parser...
41
+ * Ensure that I'm generating valid CSV (specifically that I'm quoting columns that contain commas)
42
+ * I currently have an empty file for 2010-11-30.csv. Investigate the cause.
43
+ * Republish the gem
44
+ * I'm getting 0s for the parsed overview from 2010-11-30. Investigate the cause.
data/Rakefile CHANGED
@@ -7,19 +7,6 @@ Rake::TestTask.new do |t|
7
7
  t.verbose = true
8
8
  end
9
9
 
10
- require File.join(File.dirname(__FILE__), 'lib', 'sedpr')
11
- task 'convert' do
12
- if data_dir = ENV['DATA_DIR']
13
- Dir[File.join(data_dir, '*.html')].each do |html_file|
14
- html = File.read(html_file)
15
- puts DailyPerformanceReport.new(html).to_csv
16
- end
17
- else
18
- puts "Usage: DATA_DIR=/path/to/html-reports rake convert"
19
- exit 1
20
- end
21
- end
22
-
23
10
  require "rubygems"
24
11
  require "rake/gempackagetask"
25
12
  require "rake/rdoctask"
@@ -33,7 +20,7 @@ spec = Gem::Specification.new do |s|
33
20
 
34
21
  # Change these as appropriate
35
22
  s.name = "southeastern-daily-performance"
36
- s.version = "0.0.3"
23
+ s.version = "0.0.5"
37
24
  s.summary = "Converts Southeaster Daily Performance reports from HTML to CSV"
38
25
  s.author = "Chris Roos"
39
26
  s.email = "chris@seagul.co.uk"
@@ -53,7 +40,7 @@ spec = Gem::Specification.new do |s|
53
40
  s.add_dependency('hpricot')
54
41
 
55
42
  # If your tests use any gems, include them here
56
- # s.add_development_dependency("mocha") # for example
43
+ s.add_development_dependency("mocha") # for example
57
44
 
58
45
  # If you want to publish automatically to rubyforge, you may need
59
46
  # to tweak this, and the publishing task below too.
@@ -82,47 +69,4 @@ end
82
69
  desc 'Clear out RDoc and generated packages'
83
70
  task :clean => [:clobber_rdoc, :clobber_package] do
84
71
  rm "#{spec.name}.gemspec"
85
- end
86
-
87
- # If you want to publish to RubyForge automatically, here's a simple
88
- # task to help do that. If you don't, just get rid of this.
89
- # Be sure to set up your Rubyforge account details with the Rubyforge
90
- # gem; you'll need to run `rubyforge setup` and `rubyforge config` at
91
- # the very least.
92
- begin
93
- require "rake/contrib/sshpublisher"
94
- namespace :rubyforge do
95
-
96
- desc "Release gem and RDoc documentation to RubyForge"
97
- task :release => ["rubyforge:release:gem", "rubyforge:release:docs"]
98
-
99
- namespace :release do
100
- desc "Release a new version of this gem"
101
- task :gem => [:package] do
102
- require 'rubyforge'
103
- rubyforge = RubyForge.new
104
- rubyforge.configure
105
- rubyforge.login
106
- rubyforge.userconfig['release_notes'] = spec.summary
107
- path_to_gem = File.join(File.dirname(__FILE__), "pkg", "#{spec.name}-#{spec.version}.gem")
108
- puts "Publishing #{spec.name}-#{spec.version.to_s} to Rubyforge..."
109
- rubyforge.add_release(spec.rubyforge_project, spec.name, spec.version.to_s, path_to_gem)
110
- end
111
-
112
- desc "Publish RDoc to RubyForge."
113
- task :docs => [:rdoc] do
114
- config = YAML.load(
115
- File.read(File.expand_path('~/.rubyforge/user-config.yml'))
116
- )
117
-
118
- host = "#{config['username']}@rubyforge.org"
119
- remote_dir = "/var/www/gforge-projects/southeastern-daily-performance/" # Should be the same as the rubyforge project name
120
- local_dir = 'rdoc'
121
-
122
- Rake::SshDirPublisher.new(host, remote_dir, local_dir).upload
123
- end
124
- end
125
- end
126
- rescue LoadError
127
- puts "Rake SshDirPublisher is unavailable or your rubyforge environment is not configured."
128
- end
72
+ end
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ FORMATS = ['full', 'overview']
4
+
5
+ def print_usage_instruction_and_exit
6
+ puts "Usage: #{File.basename(__FILE__)} [#{FORMATS.join('|')}] html-input-directory csv-output-directory"
7
+ exit 1
8
+ end
9
+
10
+ format, html_input_dir, csv_output_dir = ARGV
11
+ print_usage_instruction_and_exit unless format && FORMATS.include?(format)
12
+ print_usage_instruction_and_exit unless html_input_dir && File.directory?(html_input_dir)
13
+ print_usage_instruction_and_exit unless csv_output_dir && File.directory?(csv_output_dir)
14
+
15
+ # ROOT = File.expand_path('../../', __FILE__)
16
+ # LIB_DIR = File.join(ROOT, 'lib')
17
+ # CONVERTOR = File.join(ROOT, 'bin', 'sedpr-to-csv')
18
+ OPTIONAL_ARGS = (format == 'overview') ? 'overview' : ''
19
+
20
+ Dir[File.join(html_input_dir, '*.html')].each do |file|
21
+ p file
22
+
23
+ html_filename = File.basename(file)
24
+ csv_filename = html_filename.sub(/\.html/, '.csv')
25
+ csv_filename = csv_filename.sub(/\.csv/, '.overview.csv') if format == 'overview'
26
+
27
+ # cmd = "ruby -I#{LIB_DIR} #{CONVERTOR} #{file} #{OPTIONAL_ARGS} > #{File.join(csv_output_dir, csv_filename)}"
28
+ cmd = "sedpr-to-csv #{file} #{OPTIONAL_ARGS} > #{File.join(csv_output_dir, csv_filename)}"
29
+ `#{cmd}`
30
+ end
@@ -1,12 +1,18 @@
1
1
  #! /usr/bin/env ruby
2
2
 
3
3
  require 'open-uri'
4
- require File.join(File.dirname(__FILE__), '..', 'lib', 'sedpr')
4
+ require 'southeastern_daily_performance'
5
5
 
6
6
  unless html_location = ARGV[0]
7
- puts "Usage: sedpr-to-csv location-of-html # file or uri"
7
+ msg = []
8
+ msg << "Usage: sedpr-to-csv location-of-html [overview]"
9
+ msg << " * location-of-html - Can be a file or url."
10
+ msg << " * overview - Optional. Generate the overview data."
11
+ puts msg.join("\n")
8
12
  exit 1
9
13
  end
10
14
 
15
+ format = ARGV[1]
16
+
11
17
  io = open(html_location)
12
- puts SoutheasternDailyPerformance::DailyPerformanceReport.new(io.read).to_csv
18
+ puts SoutheasternDailyPerformance::DailyPerformanceReport.new(io.read).to_csv(format)
@@ -0,0 +1,3 @@
1
+ require 'southeastern_daily_performance/affected_service'
2
+ require 'southeastern_daily_performance/affected_services_report'
3
+ require 'southeastern_daily_performance/daily_performance_report'
@@ -7,12 +7,13 @@ module SoutheasternDailyPerformance
7
7
 
8
8
  def initialize(reason_for_disruption, incident_text)
9
9
  @reason_for_disruption = reason_for_disruption
10
+ incident_text.gsub!(/–/, '-') # Replace ndash with normal dash
10
11
  if incident_text =~ /(\d\d:\d\d) (.*?) ?(?:-|to|\?) (.*)/
11
12
  @scheduled_start_time, @scheduled_start_station = $1, $2
12
13
  destination_and_effect_on_service = $3
13
14
  @scheduled_start_station.gsub!(/[^a-zA-Z ]/, '')
14
15
  reasons = [
15
- 'cancelled', 'started', 'delayed by', 'did not call', 'terminated at', 'diverted'
16
+ 'cancelled', 'started', 'delayed by', 'did not call', 'terminated( at)?', 'diverted', 'ran fast', 'called'
16
17
  ]
17
18
  matches = reasons.collect do |reason|
18
19
  destination_and_effect_on_service =~ /#{reason}/i
@@ -22,7 +22,7 @@ module SoutheasternDailyPerformance
22
22
  end
23
23
 
24
24
  def scheduled_services
25
- report[/(\d+) train services were scheduled/, 1].to_i
25
+ report[/(\d+) (train )?services were scheduled/, 1].to_i
26
26
  end
27
27
 
28
28
  def actual_services
@@ -30,17 +30,21 @@ module SoutheasternDailyPerformance
30
30
  end
31
31
 
32
32
  def services_within_five_minutes_of_schedule
33
- report[/(\d+)% of services ran within 5 minutes of schedule/, 1].to_i
33
+ report[/(\d+\.?\d+?)% of services ran within 5 minutes of schedule/, 1].to_f
34
34
  end
35
35
 
36
36
  def affected_services
37
37
  AffectedServicesReport.new(report_container.inner_html).affected_services
38
38
  end
39
39
 
40
- def to_csv
41
- affected_services.collect do |service|
42
- CSV.generate_line [date, service.reason_for_disruption, service.scheduled_start_time, service.scheduled_start_station, service.scheduled_destination_station, service.effect_on_service]
43
- end.join("\n")
40
+ def to_csv(type=nil)
41
+ if "#{type}" == 'overview'
42
+ CSV.generate_line [date, scheduled_services, actual_services, services_within_five_minutes_of_schedule]
43
+ else
44
+ affected_services.collect do |service|
45
+ CSV.generate_line [date, service.reason_for_disruption, service.scheduled_start_time, service.scheduled_start_station, service.scheduled_destination_station, service.effect_on_service]
46
+ end.join("\n")
47
+ end
44
48
  end
45
49
 
46
50
  private
@@ -50,7 +54,11 @@ module SoutheasternDailyPerformance
50
54
  end
51
55
 
52
56
  def report_container
53
- (@doc/'h1').first.parent.next_sibling
57
+ if (@doc/'#mainblock').any?
58
+ (@doc/'h1').first.next_sibling
59
+ else
60
+ (@doc/'h1').first.parent.next_sibling
61
+ end
54
62
  end
55
63
 
56
64
  end
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: southeastern-daily-performance
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 21
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 0
8
- - 3
9
- version: 0.0.3
9
+ - 5
10
+ version: 0.0.5
10
11
  platform: ruby
11
12
  authors:
12
13
  - Chris Roos
@@ -14,24 +15,41 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-10-09 00:00:00 +01:00
18
+ date: 2010-12-22 00:00:00 +00:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
21
22
  name: hpricot
22
23
  prerelease: false
23
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
24
26
  requirements:
25
27
  - - ">="
26
28
  - !ruby/object:Gem::Version
29
+ hash: 3
27
30
  segments:
28
31
  - 0
29
32
  version: "0"
30
33
  type: :runtime
31
34
  version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: mocha
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
46
+ version: "0"
47
+ type: :development
48
+ version_requirements: *id002
32
49
  description:
33
50
  email: chris@seagul.co.uk
34
51
  executables:
52
+ - convert-all-html-data
35
53
  - sedpr-to-csv
36
54
  extensions: []
37
55
 
@@ -40,10 +58,12 @@ extra_rdoc_files:
40
58
  files:
41
59
  - README.md
42
60
  - Rakefile
43
- - lib/sedpr/affected_service.rb
44
- - lib/sedpr/affected_services_report.rb
45
- - lib/sedpr/daily_performance_report.rb
46
- - lib/sedpr.rb
61
+ - lib/southeastern_daily_performance/affected_service.rb
62
+ - lib/southeastern_daily_performance/affected_services_report.rb
63
+ - lib/southeastern_daily_performance/daily_performance_report.rb
64
+ - lib/southeastern_daily_performance.rb
65
+ - bin/convert-all-html-data
66
+ - bin/sedpr-to-csv
47
67
  has_rdoc: true
48
68
  homepage: http://github.com/chrisroos/southeastern-daily-performance
49
69
  licenses: []
@@ -55,23 +75,27 @@ rdoc_options:
55
75
  require_paths:
56
76
  - lib
57
77
  required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
58
79
  requirements:
59
80
  - - ">="
60
81
  - !ruby/object:Gem::Version
82
+ hash: 3
61
83
  segments:
62
84
  - 0
63
85
  version: "0"
64
86
  required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
65
88
  requirements:
66
89
  - - ">="
67
90
  - !ruby/object:Gem::Version
91
+ hash: 3
68
92
  segments:
69
93
  - 0
70
94
  version: "0"
71
95
  requirements: []
72
96
 
73
97
  rubyforge_project: southeastern-daily-performance
74
- rubygems_version: 1.3.6
98
+ rubygems_version: 1.3.7
75
99
  signing_key:
76
100
  specification_version: 3
77
101
  summary: Converts Southeaster Daily Performance reports from HTML to CSV
@@ -1,3 +0,0 @@
1
- require File.expand_path '../sedpr/affected_service', __FILE__
2
- require File.expand_path '../sedpr/affected_services_report', __FILE__
3
- require File.expand_path '../sedpr/daily_performance_report', __FILE__