abcde-at-the-movies 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +33 -0
- data/Rakefile +15 -0
- data/VERSION +1 -0
- data/lib/at_the_movies.rb +7 -0
- data/lib/at_the_movies/interview.rb +13 -0
- data/lib/at_the_movies/parser.rb +19 -0
- data/lib/at_the_movies/parsers.rb +24 -0
- data/lib/at_the_movies/parsers/interview.rb +24 -0
- data/lib/at_the_movies/parsers/review.rb +58 -0
- data/lib/at_the_movies/review.rb +33 -0
- data/lib/at_the_movies/version.rb +3 -0
- data/lib/core_ext.rb +1 -0
- data/lib/core_ext/enumerable.rb +6 -0
- data/spec/at_the_movies/interview_spec.rb +33 -0
- data/spec/at_the_movies/parser_spec.rb +8 -0
- data/spec/at_the_movies/parsers/interview_spec.rb +17 -0
- data/spec/at_the_movies/parsers/review_spec.rb +17 -0
- data/spec/at_the_movies/parsers_spec.rb +36 -0
- data/spec/at_the_movies/review_spec.rb +66 -0
- data/spec/helpers/page.rb +10 -0
- data/spec/pages/reviews +940 -0
- data/spec/pages/s1533013.htm +2058 -0
- data/spec/pages/s2625654.htm +434 -0
- data/spec/pages/s2625717.htm +435 -0
- data/spec/pages/s2625733.htm +435 -0
- data/spec/pages/s2625742.htm +435 -0
- data/spec/pages/s2631026.htm +416 -0
- data/spec/pages/s2634329.htm +414 -0
- data/spec/pages/s2642594.htm +421 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +15 -0
- metadata +100 -0
data/README.markdown
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# at-the-movies
|
2
|
+
|
3
|
+
## Description:
|
4
|
+
|
5
|
+
Scraper for http://abc.net.au/atthemovies
|
6
|
+
|
7
|
+
## Usage:
|
8
|
+
|
9
|
+
review = AtTheMovies::Parsers.for('http://www.abc.net.au/atthemovies/txt/s1533013.htm')
|
10
|
+
review.title # "Brokeback Mountain"
|
11
|
+
|
12
|
+
latest = AtTheMovies::Review.latest
|
13
|
+
latest.first.title # "Public Enemies"
|
14
|
+
|
15
|
+
## License:
|
16
|
+
|
17
|
+
(The MIT License)
|
18
|
+
|
19
|
+
Copyright (c) 2009 Dylan Egan
|
20
|
+
|
21
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
22
|
+
this software and associated documentation files (the 'Software'), to deal in
|
23
|
+
the Software without restriction, including without limitation the rights to use,
|
24
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
|
25
|
+
Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
26
|
+
|
27
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
28
|
+
|
29
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
30
|
+
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
31
|
+
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
32
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
33
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Gem packaging tasks. They are defined through Jeweler; if loading it fails
# with a LoadError, an install hint is printed instead.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    gem.name        = "at-the-movies"
    gem.summary     = "Pulling that shit in yo!"
    gem.description = "Scraper for http://abc.net.au/atthemovies"
    gem.email       = "dylanegan@gmail.com"
    gem.homepage    = "http://github.com/abcde/at-the-movies"
    gem.authors     = ["Dylan Egan"]

    # Top-level project files plus everything under lib/ and spec/.
    top_level_files = ["README.markdown", "Rakefile", "VERSION"]
    gem.files = top_level_files + Dir["{lib,spec}/**/*"]

    gem.add_dependency('mechanize')
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module AtTheMovies
  # Read-only record describing a single interview.
  class Interview
    attr_reader :title
    attr_reader :date
    attr_reader :interviewee
    attr_reader :interviewer
    attr_reader :url

    # Stores the five values exactly as given; nothing is validated or converted.
    def initialize(title, date, interviewee, interviewer, url)
      @title, @date = title, date
      @interviewee, @interviewer = interviewee, interviewer
      @url = url
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module AtTheMovies
  # Error class for failures while parsing a page.
  class ParserError < StandardError
  end

  # Base class for page parsers. It holds the page and leaves #parse to subclasses.
  class Parser
    attr_reader :page

    class << self
      # Shortcut: wraps +page+ in a new parser instance and returns its #parse result.
      def parse(page)
        new(page).parse
      end
    end

    def initialize(page)
      @page = page
    end

    # Placeholder implementation; always raises NotImplementedError.
    def parse
      raise NotImplementedError, "where am I?"
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
%w( interview review ).each { |parser| require File.dirname(__FILE__) + "/parsers/#{parser}" }
|
2
|
+
|
3
|
+
module AtTheMovies
  module Parsers
    # Maps the content of a page's "ABC-Author" meta tag to the parser class
    # that handles that kind of page.
    MAP = {
      "review" => Review
    }

    # Fetches +url+ and hands the page to the parser registered for its type.
    #
    # Options:
    #   :tries - how many fetch attempts to make before giving up (default 5).
    #   :only  - when given, pages whose type differs from this value are skipped.
    #
    # Returns whatever the selected parser's .parse returns, or nil when the
    # page is skipped because of :only.
    #
    # Raises AtTheMovies::ParserError once every fetch attempt has failed, and
    # ArgumentError when no parser is registered for the page's type (this
    # includes pages that have no "ABC-Author" meta tag at all).
    def self.for(url, options = {})
      attempts = 0
      begin
        page = WWW::Mechanize.new.get(url)
      rescue => e
        retry if (attempts += 1) < (options[:tries] || 5)
        raise AtTheMovies::ParserError,
              "Failed to parse #{url}. #{e.message}"
      end

      # A page without the meta tag is treated as having an unknown (nil) type
      # rather than blowing up with NoMethodError on nil.
      meta = page.search('meta[@name="ABC-Author"]').first
      type = meta && meta['content']

      return if options[:only] && type != options[:only]

      parser = MAP[type] || raise(ArgumentError, "Couldn't find a Parser class to parse a #{type.inspect} page")
      parser.parse(page)
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module AtTheMovies
  module Parsers
    # Parser that turns a fetched page into an AtTheMovies::Interview.
    class Interview < Parser
      # Returns an AtTheMovies::Interview, or nil when the page body contains no
      # "<Margaret|David> interviews ..." sentence.
      def parse
        interviewer, interviewee = details
        return unless interviewer && interviewee

        AtTheMovies::Interview.new(title, date, interviewee, interviewer, @page.uri.to_s)
      end

      # Returns the flattened regex captures from the page body; #parse uses the
      # first two as interviewer and interviewee.
      #
      # The unescaped "." before "</p>" matches any single character
      # (NOTE(review): presumably the sentence's closing full stop - confirm).
      def details
        body = @page.body
        found = body.scan(/(Margaret|David) interviews (.*).<\/p>\r/)
        # Only when the CRLF form finds nothing is the same pattern retried
        # against a plain end of line, so bodies that matched before yield
        # exactly the same captures.
        found = body.scan(/(Margaret|David) interviews (.*).<\/p>$/) if found.empty?
        found.flatten
      end

      # Stripped page title with its first 15 characters removed
      # (NOTE(review): presumably a fixed site prefix - confirm).
      def title
        @page.title.strip[15..-1]
      end

      # Date parsed from the "Date" meta tag, with slashes turned into dashes first.
      def date
        Date.parse(@page.search('meta[@name="Date"]').first['content'].gsub('/', '-'))
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module AtTheMovies
  module Parsers
    # Parser that turns a fetched page into an AtTheMovies::Review.
    class Review < Parser
      # Rating word (taken from the score images' alt text) => numeric value.
      RATINGS = {
        "zero"             => 0,
        "half"             => 0.5,
        "one"              => 1,
        "one-and-a-half"   => 1.5,
        "two"              => 2,
        "two-and-a-half"   => 2.5,
        "three"            => 3,
        "three-and-a-half" => 3.5,
        "four"             => 4,
        "four-and-a-half"  => 4.5,
        "five"             => 5
      }

      # Returns an AtTheMovies::Review, or nil when the body lacks "Review by".
      def parse
        if @page.body[/Review by/]
          AtTheMovies::Review.new(title, classification, date, duration, genre, ratings, @page.uri.to_s)
        end
      end

      # Inner HTML of the p.moviedetails element(s), stripped and memoized.
      def details
        @details ||= @page.search('p.moviedetails').inner_html.strip
      end

      # Stripped page title with its first 15 characters removed
      # (NOTE(review): presumably a fixed site prefix - confirm).
      def title
        stripped = @page.title.strip
        stripped[15..-1]
      end

      # First capture after "Classification:</strong> ", or nil if absent.
      def classification
        details[/Classification:<\/strong> ([[:alnum:][:punct:][:space:]]{1,10})<br>/, 1]
      end

      # Date parsed from the "Date" meta tag, with slashes turned into dashes first.
      def date
        raw = @page.search('meta[@name="Date"]').first['content']
        Date.parse(raw.gsub('/', '-'))
      end

      # Leading digits after "Duration:</strong> " as an Integer (0 when absent).
      def duration
        details[/Duration:<\/strong> ([0-9]{1,3})/, 1].to_i
      end

      # Text between "Genre:</strong> " and the following Director label, or nil.
      def genre
        details[/Genre:<\/strong> ([[:alnum:][:space:][:punct:]]+)<br><strong>Director:/, 1]
      end

      # Builds a { reviewer name => numeric rating } hash by pairing, in order,
      # the names found in the score text with the score images' alt text.
      def ratings
        score_nodes = @page.search('p.score')
        # Drops the last six characters of each alt text
        # (NOTE(review): presumably a " stars" suffix - confirm).
        words = score_nodes.css('img').map { |image| image['alt'][0..-7] }
        names = score_nodes.css('p.score').text.scan(/Margaret|David/)

        names.inject_with_index({}) do |by_person, person, index|
          by_person[person] = RATINGS[words[index]]
          by_person
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module AtTheMovies
  # A film review: descriptive attributes plus a hash of per-reviewer ratings.
  class Review
    attr_reader :title, :classification, :date, :duration, :genre, :ratings, :url

    # Fetches the review index page and runs AtTheMovies::Parsers.for (limited
    # to "review" pages) on every link inside its first table.sideRating
    # element. Links that do not yield a review are dropped.
    #
    # Returns an Array of the parsed results, in link order.
    def self.latest
      index = WWW::Mechanize.new.get('http://www.abc.net.au/atthemovies/review/')
      links = index.search('table.sideRating').first.css('a')
      links.map { |a| AtTheMovies::Parsers.for(a['href'], :only => "review") }.compact
    end

    # ratings is expected to be a Hash of capitalized reviewer name => number,
    # since #rating looks entries up by capitalized name.
    def initialize(title, classification, date, duration, genre, ratings, url)
      @title = title
      @classification = classification
      @duration = duration
      @genre = genre
      @date = date
      @ratings = ratings
      @url = url
    end

    # With no argument, returns the combined rating (see #total_rating).
    # With a rater (String or Symbol, any case), returns that rater's rating,
    # or nil when there is no entry under the capitalized name.
    def rating(rater = nil)
      return total_rating unless rater
      @ratings[rater.to_s.capitalize]
    end

    # Sum of all ratings. Raters whose value is nil are skipped instead of
    # raising a TypeError.
    def total_rating
      @ratings.values.compact.inject(0) { |sum, stars| sum + stars }
    end
  end
end
|
data/lib/core_ext.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/core_ext/enumerable'
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Checks each attribute of an Interview built from the cached s2642594 page.
describe AtTheMovies::Interview do
  context "with a parsed interview" do
    before do
      interview_url = 'http://www.abc.net.au/atthemovies/txt/s2642594.htm'
      FakeWeb.register_uri(:get, interview_url, :response => cached_page_for(interview_url))
      fetched = WWW::Mechanize.new.get(interview_url)
      @interview = AtTheMovies::Parsers::Interview.parse(fetched)
    end

    it "should have a title" do
      @interview.title.should == "The Cove Interview"
    end

    it "should have a date" do
      @interview.date.to_s.should == "2009-08-19"
    end

    it "should have an interviewee" do
      @interview.interviewee.should == "filmmaker Louie Psihoyos"
    end

    it "should have an interviewer" do
      @interview.interviewer.should == "David"
    end

    it "should have a url" do
      @interview.url.should == "http://www.abc.net.au/atthemovies/txt/s2642594.htm"
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
# Checks that the interview parser produces an AtTheMovies::Interview.
describe AtTheMovies::Parsers::Interview do
  describe "parse" do
    before do
      interview_url = 'http://www.abc.net.au/atthemovies/txt/s2642594.htm'
      FakeWeb.register_uri(:get, interview_url, :response => cached_page_for(interview_url))
      fetched = WWW::Mechanize.new.get(interview_url)
      @interview = AtTheMovies::Parsers::Interview.parse(fetched)
    end

    it "should return an Interview object" do
      @interview.should be_an_instance_of(AtTheMovies::Interview)
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
# Checks that the review parser produces an AtTheMovies::Review.
describe AtTheMovies::Parsers::Review do
  describe "parse" do
    before do
      review_url = 'http://www.abc.net.au/atthemovies/txt/s1533013.htm'
      FakeWeb.register_uri(:get, review_url, :response => cached_page_for(review_url))
      fetched = WWW::Mechanize.new.get(review_url)
      @review = AtTheMovies::Parsers::Review.parse(fetched)
    end

    it "should return a Review object" do
      @review.should be_an_instance_of(AtTheMovies::Review)
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Covers Parsers.for: parser selection, the :only filter, and retry-then-raise.
describe AtTheMovies::Parsers do
  describe "for" do
    context "with a review page" do
      before do
        review_url = 'http://www.abc.net.au/atthemovies/txt/s1533013.htm'
        FakeWeb.register_uri(:get, review_url, :response => cached_page_for(review_url))
        @parsed = AtTheMovies::Parsers.for(review_url)
      end

      it "should return a Review object" do
        @parsed.should be_an_instance_of(AtTheMovies::Review)
      end
    end

    context "using only" do
      before do
        interview_url = 'http://www.abc.net.au/atthemovies/s2634329.htm'
        FakeWeb.register_uri(:get, interview_url, :response => cached_page_for(interview_url))
        @parsed = AtTheMovies::Parsers.for(interview_url, :only => "review")
      end

      it "should return nothing for review when page is an interview" do
        @parsed.should be_nil
      end
    end

    context "parsing issues" do
      it "should try five times before reraising" do
        # Every attempt to build a Mechanize agent fails.
        WWW::Mechanize.should_receive(:new).exactly(5).times.and_raise(Errno::ECONNREFUSED)

        failing_call = lambda { AtTheMovies::Parsers.for('url') }
        failing_call.should raise_error(AtTheMovies::ParserError, "Failed to parse url. Connection refused")
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Checks a Review parsed from the cached s1533013 page, and Review.latest
# against the cached index page.
describe AtTheMovies::Review do
  context "with a parsed review" do
    before do
      review_url = 'http://www.abc.net.au/atthemovies/txt/s1533013.htm'
      FakeWeb.register_uri(:get, review_url, :response => cached_page_for(review_url))
      fetched = WWW::Mechanize.new.get(review_url)
      @review = AtTheMovies::Parsers::Review.parse(fetched)
    end

    it "should have a title" do
      @review.title.should == "Brokeback Mountain"
    end

    it "should have a classification" do
      @review.classification.should == "M"
    end

    it "should have a date" do
      @review.date.to_s.should == "2005-12-18"
    end

    it "should have a duration" do
      @review.duration.should == 134
    end

    it "should have a genre" do
      @review.genre.should == "Drama"
    end

    it "should have a total rating" do
      @review.rating.should == 10
    end

    it "should have a rating by Margaret" do
      @review.rating(:margaret).should == 5
    end

    it "should have a rating by David" do
      @review.rating(:david).should == 5
    end

    it "should have a url" do
      @review.url.should == "http://www.abc.net.au/atthemovies/txt/s1533013.htm"
    end
  end

  context "finding" do
    context "latest" do
      before do
        index_url = 'http://www.abc.net.au/atthemovies/review/'
        FakeWeb.register_uri(:get, index_url, :response => cached_page('reviews'))

        # Register a cached response for each of these page ids.
        page_ids = ["s2625733", "s2625742", "s2625654", "s2625717", "s2634329", "s2631026"]
        page_ids.each do |page_id|
          FakeWeb.register_uri(:get, "http://www.abc.net.au/atthemovies/txt/#{page_id}.htm", :response => cached_page("#{page_id}.htm"))
        end

        @reviews = AtTheMovies::Review.latest
      end

      it "should contain the latest reviews only" do
        titles = @reviews.map { |review| review.title }
        titles.should == ["Public Enemies", "Coraline", "My Sister's Keeper", "Cedar Boys"]
      end
    end
  end
end
|