southeastern-daily-performance 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +27 -2
- data/Rakefile +3 -59
- data/bin/convert-all-html-data +30 -0
- data/bin/sedpr-to-csv +9 -3
- data/lib/southeastern_daily_performance.rb +3 -0
- data/lib/{sedpr → southeastern_daily_performance}/affected_service.rb +2 -1
- data/lib/{sedpr → southeastern_daily_performance}/affected_services_report.rb +0 -0
- data/lib/{sedpr → southeastern_daily_performance}/daily_performance_report.rb +15 -7
- metadata +32 -8
- data/lib/sedpr.rb +0 -3
data/README.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
## Intro
|
2
|
+
|
3
|
+
Southeastern publish [Daily Performance reports](http://www.southeasternrailway.co.uk/your-journey/daily-performance/). They're not very usable as they are. This project currently contains the code I've used to parse that data and the parsed data itself.
|
4
|
+
|
5
|
+
I've also popped the data into [this Google Fusion table](http://www.google.com/fusiontables/DataSource?dsrcid=359310) and added a bit of a [query tool](http://chrisroos.github.com/southeastern-daily-performance/) and [dashboard](http://chrisroos.github.com/southeastern-daily-performance/dashboard.html) (of sorts).
|
6
|
+
|
1
7
|
## Installation
|
2
8
|
|
3
9
|
$ gem install southeastern-daily-performance -r http://gemcutter.org
|
@@ -13,7 +19,26 @@
|
|
13
19
|
$ curl "http://www.southeasternrailway.co.uk/index.php/cms/pages/view/132" > sedpr.html
|
14
20
|
$ sedpr-to-csv sedpr.html
|
15
21
|
|
16
|
-
|
17
22
|
### Implicitly download html and convert
|
18
23
|
|
19
|
-
$ sedpr-to-csv http://www.southeasternrailway.co.uk/index.php/cms/pages/view/132
|
24
|
+
$ sedpr-to-csv http://www.southeasternrailway.co.uk/index.php/cms/pages/view/132
|
25
|
+
|
26
|
+
## Notes
|
27
|
+
|
28
|
+
### Combining all csv files into one big file
|
29
|
+
|
30
|
+
$ echo "Date,Problem,Scheduled departure time,Scheduled departure station,Scheduled arrival station,Affect on service" > combined.csv
|
31
|
+
$ cat /path/to/csv/files/*.csv >> combined.csv
|
32
|
+
|
33
|
+
### Combining all csv overview files into one file
|
34
|
+
|
35
|
+
$ echo "Date,Services scheduled,Services run,Services within 5 minutes of schedule" > combined.overview.csv
|
36
|
+
$ cat /path/to/csv/files/*.overview.csv >> combined.overview.csv
|
37
|
+
|
38
|
+
## TODO
|
39
|
+
|
40
|
+
* 2010-04-21 breaks the parser...
|
41
|
+
* Ensure that I'm generating valid CSV (specifically that I'm quoting columns that contain commas)
|
42
|
+
* I currently have an empty file for 2010-11-30.csv. Investigate the cause.
|
43
|
+
* Republish the gem
|
44
|
+
* I'm getting 0s for the parsed overview from 2010-11-30. Investigate the cause.
|
data/Rakefile
CHANGED
@@ -7,19 +7,6 @@ Rake::TestTask.new do |t|
|
|
7
7
|
t.verbose = true
|
8
8
|
end
|
9
9
|
|
10
|
-
require File.join(File.dirname(__FILE__), 'lib', 'sedpr')
|
11
|
-
task 'convert' do
|
12
|
-
if data_dir = ENV['DATA_DIR']
|
13
|
-
Dir[File.join(data_dir, '*.html')].each do |html_file|
|
14
|
-
html = File.read(html_file)
|
15
|
-
puts DailyPerformanceReport.new(html).to_csv
|
16
|
-
end
|
17
|
-
else
|
18
|
-
puts "Usage: DATA_DIR=/path/to/html-reports rake convert"
|
19
|
-
exit 1
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
10
|
require "rubygems"
|
24
11
|
require "rake/gempackagetask"
|
25
12
|
require "rake/rdoctask"
|
@@ -33,7 +20,7 @@ spec = Gem::Specification.new do |s|
|
|
33
20
|
|
34
21
|
# Change these as appropriate
|
35
22
|
s.name = "southeastern-daily-performance"
|
36
|
-
s.version = "0.0.3"
|
23
|
+
s.version = "0.0.5"
|
37
24
|
s.summary = "Converts Southeaster Daily Performance reports from HTML to CSV"
|
38
25
|
s.author = "Chris Roos"
|
39
26
|
s.email = "chris@seagul.co.uk"
|
@@ -53,7 +40,7 @@ spec = Gem::Specification.new do |s|
|
|
53
40
|
s.add_dependency('hpricot')
|
54
41
|
|
55
42
|
# If your tests use any gems, include them here
|
56
|
-
|
43
|
+
s.add_development_dependency("mocha") # for example
|
57
44
|
|
58
45
|
# If you want to publish automatically to rubyforge, you'll may need
|
59
46
|
# to tweak this, and the publishing task below too.
|
@@ -82,47 +69,4 @@ end
|
|
82
69
|
desc 'Clear out RDoc and generated packages'
|
83
70
|
task :clean => [:clobber_rdoc, :clobber_package] do
|
84
71
|
rm "#{spec.name}.gemspec"
|
85
|
-
end
|
86
|
-
|
87
|
-
# If you want to publish to RubyForge automatically, here's a simple
|
88
|
-
# task to help do that. If you don't, just get rid of this.
|
89
|
-
# Be sure to set up your Rubyforge account details with the Rubyforge
|
90
|
-
# gem; you'll need to run `rubyforge setup` and `rubyforge config` at
|
91
|
-
# the very least.
|
92
|
-
begin
|
93
|
-
require "rake/contrib/sshpublisher"
|
94
|
-
namespace :rubyforge do
|
95
|
-
|
96
|
-
desc "Release gem and RDoc documentation to RubyForge"
|
97
|
-
task :release => ["rubyforge:release:gem", "rubyforge:release:docs"]
|
98
|
-
|
99
|
-
namespace :release do
|
100
|
-
desc "Release a new version of this gem"
|
101
|
-
task :gem => [:package] do
|
102
|
-
require 'rubyforge'
|
103
|
-
rubyforge = RubyForge.new
|
104
|
-
rubyforge.configure
|
105
|
-
rubyforge.login
|
106
|
-
rubyforge.userconfig['release_notes'] = spec.summary
|
107
|
-
path_to_gem = File.join(File.dirname(__FILE__), "pkg", "#{spec.name}-#{spec.version}.gem")
|
108
|
-
puts "Publishing #{spec.name}-#{spec.version.to_s} to Rubyforge..."
|
109
|
-
rubyforge.add_release(spec.rubyforge_project, spec.name, spec.version.to_s, path_to_gem)
|
110
|
-
end
|
111
|
-
|
112
|
-
desc "Publish RDoc to RubyForge."
|
113
|
-
task :docs => [:rdoc] do
|
114
|
-
config = YAML.load(
|
115
|
-
File.read(File.expand_path('~/.rubyforge/user-config.yml'))
|
116
|
-
)
|
117
|
-
|
118
|
-
host = "#{config['username']}@rubyforge.org"
|
119
|
-
remote_dir = "/var/www/gforge-projects/southeastern-daily-performance/" # Should be the same as the rubyforge project name
|
120
|
-
local_dir = 'rdoc'
|
121
|
-
|
122
|
-
Rake::SshDirPublisher.new(host, remote_dir, local_dir).upload
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
rescue LoadError
|
127
|
-
puts "Rake SshDirPublisher is unavailable or your rubyforge environment is not configured."
|
128
|
-
end
|
72
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
FORMATS = ['full', 'overview']
|
4
|
+
|
5
|
+
def print_usage_instruction_and_exit
|
6
|
+
puts "Usage: #{File.basename(__FILE__)} [#{FORMATS.join('|')}] html-input-directory csv-output-directory"
|
7
|
+
exit 1
|
8
|
+
end
|
9
|
+
|
10
|
+
format, html_input_dir, csv_output_dir = ARGV
|
11
|
+
print_usage_instruction_and_exit unless format && FORMATS.include?(format)
|
12
|
+
print_usage_instruction_and_exit unless html_input_dir && File.directory?(html_input_dir)
|
13
|
+
print_usage_instruction_and_exit unless csv_output_dir && File.directory?(csv_output_dir)
|
14
|
+
|
15
|
+
# ROOT = File.expand_path('../../', __FILE__)
|
16
|
+
# LIB_DIR = File.join(ROOT, 'lib')
|
17
|
+
# CONVERTOR = File.join(ROOT, 'bin', 'sedpr-to-csv')
|
18
|
+
OPTIONAL_ARGS = (format == 'overview') ? 'overview' : ''
|
19
|
+
|
20
|
+
Dir[File.join(html_input_dir, '*.html')].each do |file|
|
21
|
+
p file
|
22
|
+
|
23
|
+
html_filename = File.basename(file)
|
24
|
+
csv_filename = html_filename.sub(/\.html/, '.csv')
|
25
|
+
csv_filename = csv_filename.sub(/\.csv/, '.overview.csv') if format == 'overview'
|
26
|
+
|
27
|
+
# cmd = "ruby -I#{LIB_DIR} #{CONVERTOR} #{file} #{OPTIONAL_ARGS} > #{File.join(csv_output_dir, csv_filename)}"
|
28
|
+
cmd = "sedpr-to-csv #{file} #{OPTIONAL_ARGS} > #{File.join(csv_output_dir, csv_filename)}"
|
29
|
+
`#{cmd}`
|
30
|
+
end
|
data/bin/sedpr-to-csv
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
|
-
require
|
4
|
+
require 'southeastern_daily_performance'
|
5
5
|
|
6
6
|
unless html_location = ARGV[0]
|
7
|
-
|
7
|
+
msg = []
|
8
|
+
msg << "Usage: sedpr-to-csv location-of-html [overview]"
|
9
|
+
msg << " * location-of-html - Can be a file or url."
|
10
|
+
msg << " * overview - Optional. Generate the overview data."
|
11
|
+
puts msg.join("\n")
|
8
12
|
exit 1
|
9
13
|
end
|
10
14
|
|
15
|
+
format = ARGV[1]
|
16
|
+
|
11
17
|
io = open(html_location)
|
12
|
-
puts SoutheasternDailyPerformance::DailyPerformanceReport.new(io.read).to_csv
|
18
|
+
puts SoutheasternDailyPerformance::DailyPerformanceReport.new(io.read).to_csv(format)
|
@@ -7,12 +7,13 @@ module SoutheasternDailyPerformance
|
|
7
7
|
|
8
8
|
def initialize(reason_for_disruption, incident_text)
|
9
9
|
@reason_for_disruption = reason_for_disruption
|
10
|
+
incident_text.gsub!(/–/, '-') # Replace ndash with normal dash
|
10
11
|
if incident_text =~ /(\d\d:\d\d) (.*?) ?(?:-|to|\?) (.*)/
|
11
12
|
@scheduled_start_time, @scheduled_start_station = $1, $2
|
12
13
|
destination_and_effect_on_service = $3
|
13
14
|
@scheduled_start_station.gsub!(/[^a-zA-Z ]/, '')
|
14
15
|
reasons = [
|
15
|
-
'cancelled', 'started', 'delayed by', 'did not call', 'terminated at', 'diverted'
|
16
|
+
'cancelled', 'started', 'delayed by', 'did not call', 'terminated( at)?', 'diverted', 'ran fast', 'called'
|
16
17
|
]
|
17
18
|
matches = reasons.collect do |reason|
|
18
19
|
destination_and_effect_on_service =~ /#{reason}/i
|
File without changes
|
@@ -22,7 +22,7 @@ module SoutheasternDailyPerformance
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def scheduled_services
|
25
|
-
report[/(\d+) train services were scheduled/, 1].to_i
|
25
|
+
report[/(\d+) (train )?services were scheduled/, 1].to_i
|
26
26
|
end
|
27
27
|
|
28
28
|
def actual_services
|
@@ -30,17 +30,21 @@ module SoutheasternDailyPerformance
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def services_within_five_minutes_of_schedule
|
33
|
-
report[/(\d
|
33
|
+
report[/(\d+\.?\d+?)% of services ran within 5 minutes of schedule/, 1].to_f
|
34
34
|
end
|
35
35
|
|
36
36
|
def affected_services
|
37
37
|
AffectedServicesReport.new(report_container.inner_html).affected_services
|
38
38
|
end
|
39
39
|
|
40
|
-
def to_csv
|
41
|
-
|
42
|
-
CSV.generate_line [date,
|
43
|
-
|
40
|
+
def to_csv(type=nil)
|
41
|
+
if "#{type}" == 'overview'
|
42
|
+
CSV.generate_line [date, scheduled_services, actual_services, services_within_five_minutes_of_schedule]
|
43
|
+
else
|
44
|
+
affected_services.collect do |service|
|
45
|
+
CSV.generate_line [date, service.reason_for_disruption, service.scheduled_start_time, service.scheduled_start_station, service.scheduled_destination_station, service.effect_on_service]
|
46
|
+
end.join("\n")
|
47
|
+
end
|
44
48
|
end
|
45
49
|
|
46
50
|
private
|
@@ -50,7 +54,11 @@ module SoutheasternDailyPerformance
|
|
50
54
|
end
|
51
55
|
|
52
56
|
def report_container
|
53
|
-
(@doc/'
|
57
|
+
if (@doc/'#mainblock').any?
|
58
|
+
(@doc/'h1').first.next_sibling
|
59
|
+
else
|
60
|
+
(@doc/'h1').first.parent.next_sibling
|
61
|
+
end
|
54
62
|
end
|
55
63
|
|
56
64
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: southeastern-daily-performance
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 0
|
8
|
-
- 3
|
9
|
-
version: 0.0.3
|
9
|
+
- 5
|
10
|
+
version: 0.0.5
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Chris Roos
|
@@ -14,24 +15,41 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-
|
18
|
+
date: 2010-12-22 00:00:00 +00:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
21
22
|
name: hpricot
|
22
23
|
prerelease: false
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
27
30
|
segments:
|
28
31
|
- 0
|
29
32
|
version: "0"
|
30
33
|
type: :runtime
|
31
34
|
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: mocha
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
32
49
|
description:
|
33
50
|
email: chris@seagul.co.uk
|
34
51
|
executables:
|
52
|
+
- convert-all-html-data
|
35
53
|
- sedpr-to-csv
|
36
54
|
extensions: []
|
37
55
|
|
@@ -40,10 +58,12 @@ extra_rdoc_files:
|
|
40
58
|
files:
|
41
59
|
- README.md
|
42
60
|
- Rakefile
|
43
|
-
- lib/
|
44
|
-
- lib/
|
45
|
-
- lib/
|
46
|
-
- lib/
|
61
|
+
- lib/southeastern_daily_performance/affected_service.rb
|
62
|
+
- lib/southeastern_daily_performance/affected_services_report.rb
|
63
|
+
- lib/southeastern_daily_performance/daily_performance_report.rb
|
64
|
+
- lib/southeastern_daily_performance.rb
|
65
|
+
- bin/convert-all-html-data
|
66
|
+
- bin/sedpr-to-csv
|
47
67
|
has_rdoc: true
|
48
68
|
homepage: http://github.com/chrisroos/southeastern-daily-performance
|
49
69
|
licenses: []
|
@@ -55,23 +75,27 @@ rdoc_options:
|
|
55
75
|
require_paths:
|
56
76
|
- lib
|
57
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
58
79
|
requirements:
|
59
80
|
- - ">="
|
60
81
|
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
61
83
|
segments:
|
62
84
|
- 0
|
63
85
|
version: "0"
|
64
86
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
65
88
|
requirements:
|
66
89
|
- - ">="
|
67
90
|
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
68
92
|
segments:
|
69
93
|
- 0
|
70
94
|
version: "0"
|
71
95
|
requirements: []
|
72
96
|
|
73
97
|
rubyforge_project: southeastern-daily-performance
|
74
|
-
rubygems_version: 1.3.
|
98
|
+
rubygems_version: 1.3.7
|
75
99
|
signing_key:
|
76
100
|
specification_version: 3
|
77
101
|
summary: Converts Southeaster Daily Performance reports from HTML to CSV
|
data/lib/sedpr.rb
DELETED