abcde-at-the-movies 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,33 @@
1
+ # at-the-movies
2
+
3
+ ## Description:
4
+
5
+ Scraper for http://abc.net.au/atthemovies
6
+
7
+ ## Usage:
8
+
9
+ review = AtTheMovies::Parsers.for('http://www.abc.net.au/atthemovies/txt/s1533013.htm')
10
+ review.title # "Brokeback Mountain"
11
+
12
+ latest = AtTheMovies::Review.latest
13
+ latest.first.title # "Public Enemies"
14
+
15
+ ## License:
16
+
17
+ (The MIT License)
18
+
19
+ Copyright (c) 2009 Dylan Egan
20
+
21
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
22
+ this software and associated documentation files (the 'Software'), to deal in
23
+ the Software without restriction, including without limitation the rights to use,
24
+ copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
25
+ Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
26
+
27
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
28
+
29
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
30
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
31
+ PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
32
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
33
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,15 @@
1
# Declares the gem's packaging metadata through Jeweler. If Jeweler cannot be
# loaded, the LoadError is rescued and an install hint is printed instead.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name        = "at-the-movies"
    spec.summary     = "Pulling that shit in yo!"
    spec.description = "Scraper for http://abc.net.au/atthemovies"
    spec.email       = "dylanegan@gmail.com"
    spec.homepage    = "http://github.com/abcde/at-the-movies"
    spec.authors     = ["Dylan Egan"]

    # Package the top-level metadata files plus everything under lib/ and spec/.
    metadata_files = %w(README.markdown Rakefile VERSION)
    spec.files     = metadata_files + Dir.glob("{lib,spec}/**/*")

    spec.add_dependency 'mechanize'
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,7 @@
1
+ require 'mechanize'
2
+
3
+ $:.unshift File.dirname(__FILE__) + '/at_the_movies'
4
+
5
+ %w( interview parser parsers review ).each { |lib| require lib }
6
+
7
+ require File.dirname(__FILE__) + '/core_ext'
@@ -0,0 +1,13 @@
1
module AtTheMovies
  # Read-only value object describing a single interview.
  #
  # The five constructor arguments are stored untouched and exposed through
  # readers of the same name; no validation or conversion is performed.
  class Interview
    attr_reader :title, :date, :interviewee, :interviewer, :url

    # title       - title of the interview
    # date        - date of the interview
    # interviewee - the person being interviewed
    # interviewer - the person asking the questions
    # url         - address of the interview
    def initialize(title, date, interviewee, interviewer, url)
      @title, @date, @interviewee, @interviewer, @url =
        title, date, interviewee, interviewer, url
    end
  end
end
@@ -0,0 +1,19 @@
1
module AtTheMovies
  # Error class used by the parsing layer.
  class ParserError < StandardError; end

  # Abstract base class for page parsers. It only remembers the page it was
  # given; turning that page into an object is left to subclasses.
  class Parser
    attr_reader :page

    # Shortcut that wraps +page+ in a new parser instance and immediately
    # returns the result of calling #parse on it.
    def self.parse(page)
      new(page).parse
    end

    # page - the page object that #parse will work on.
    def initialize(page)
      @page = page
    end

    # Must be overridden by subclasses; the base implementation always raises
    # NotImplementedError.
    def parse
      raise NotImplementedError, "where am I?"
    end
  end
end
@@ -0,0 +1,24 @@
1
+ %w( interview review ).each { |parser| require File.dirname(__FILE__) + "/parsers/#{parser}" }
2
+
3
module AtTheMovies
  # Fetches a page and hands it to the parser registered for its type.
  module Parsers
    # Page type (the content of the page's "ABC-Author" meta tag) => parser class.
    MAP = {
      "review" => Review
    }

    # Fetches +url+ and parses it with the parser registered for its type.
    #
    # url     - address of the page to fetch.
    # options - Hash of options:
    #           :tries - maximum number of fetch attempts (default 5).
    #           :only  - when given, a page of any other type yields nil.
    #
    # Returns whatever the selected parser's .parse returns, or nil when the
    # page is filtered out by :only.
    # Raises AtTheMovies::ParserError once every fetch attempt has failed.
    # Raises ArgumentError when no parser is registered for the page's type.
    def self.for(url, options = {})
      tries = 0
      begin
        page = WWW::Mechanize.new.get(url)
      rescue => e
        # Bounded retry: give up after options[:tries] (default 5) attempts.
        retry if (tries += 1) < (options[:tries] || 5)
        raise AtTheMovies::ParserError,
          "Failed to parse #{url}. #{e.message}"
      end

      # A page without the meta tag has no type. Treat the type as nil rather
      # than crashing with NoMethodError, so that :only filters the page out
      # and an unfiltered call reports the usual ArgumentError.
      meta = page.search('meta[@name="ABC-Author"]').first
      type = meta && meta['content']

      return if options[:only] && type != options[:only]
      parser = MAP[type] || raise(ArgumentError, "Couldn't find a Parser class to parse a #{type.inspect} page")
      parser.parse(page)
    end
  end
end
@@ -0,0 +1,24 @@
1
module AtTheMovies
  module Parsers
    # Builds an AtTheMovies::Interview from an interview page.
    class Interview < Parser
      # Captures the interviewer (Margaret or David) and the interviewee from a
      # "<interviewer> interviews <interviewee>.</p>" sentence.
      #
      # The sentence-ending period is matched literally and is optional, so it
      # can no longer swallow the last character of the interviewee. The
      # closing tag may be followed by a carriage return or by the end of the
      # line, so pages with LF-only line endings match as well.
      DETAILS_PATTERN = /(Margaret|David) interviews (.*?)\.?<\/p>(?:\r|$)/

      # Returns an AtTheMovies::Interview, or nil when the page body does not
      # contain the interview sentence.
      def parse
        interviewer, interviewee = details
        return unless interviewer && interviewee
        AtTheMovies::Interview.new(title, date, interviewee, interviewer, @page.uri.to_s)
      end

      # Returns a flat Array of captures: interviewer, interviewee, and then
      # the same pair again for every further match in the page body.
      def details
        @page.body.scan(DETAILS_PATTERN).flatten
      end

      # Page title with its first 15 characters removed
      # (presumably a fixed site-name prefix; confirm against a live page).
      def title
        @page.title.strip[15..-1]
      end

      # Date taken from the page's "Date" meta tag, with slashes turned into
      # dashes before it is handed to Date.parse.
      def date
        Date.parse(@page.search('meta[@name="Date"]').first['content'].gsub('/', '-'))
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
module AtTheMovies
  module Parsers
    # Builds an AtTheMovies::Review from a review page.
    class Review < Parser
      # Spelled-out rating => numeric score. The lookup keys are the rating
      # images' alt texts with their last six characters removed (see #ratings).
      RATINGS = {
        "zero"             => 0,
        "half"             => 0.5,
        "one"              => 1,
        "one-and-a-half"   => 1.5,
        "two"              => 2,
        "two-and-a-half"   => 2.5,
        "three"            => 3,
        "three-and-a-half" => 3.5,
        "four"             => 4,
        "four-and-a-half"  => 4.5,
        "five"             => 5
      }

      # Returns an AtTheMovies::Review, or nil when the page body does not
      # contain the text "Review by".
      def parse
        if @page.body[/Review by/]
          AtTheMovies::Review.new(title, classification, date, duration, genre, ratings, @page.uri.to_s)
        end
      end

      # Inner HTML of the "p.moviedetails" elements, stripped and memoized.
      def details
        @details ||= @page.search('p.moviedetails').inner_html.strip
      end

      # Page title with its first 15 characters removed.
      def title
        @page.title.strip[15..-1]
      end

      # Text following the "Classification:" label (1 to 10 characters), or nil.
      def classification
        details[/Classification:<\/strong> ([[:alnum:][:punct:][:space:]]{1,10})<br>/, 1]
      end

      # Date from the page's "Date" meta tag, slashes replaced by dashes.
      def date
        Date.parse(@page.search('meta[@name="Date"]').first['content'].gsub('/', '-'))
      end

      # Number following the "Duration:" label as an Integer; 0 when absent.
      def duration
        details[/Duration:<\/strong> ([0-9]{1,3})/, 1].to_i
      end

      # Text between the "Genre:" label and the "Director:" label, or nil.
      def genre
        details[/Genre:<\/strong> ([[:alnum:][:space:][:punct:]]+)<br><strong>Director:/, 1]
      end

      # Returns a Hash of reviewer name => numeric score. The n-th name found
      # in the score paragraph's text is paired with the n-th rating image.
      def ratings
        score = @page.search('p.score')
        # Drop the trailing six characters of each alt text to get a RATINGS key.
        words = score.css('img').collect { |image| image['alt'][0..-7] }
        names = score.css('p.score').text.scan(/Margaret|David/)

        by_person = {}
        names.each_with_index do |person, index|
          by_person[person] = RATINGS[words[index]]
        end
        by_person
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module AtTheMovies
  # A parsed film review together with the score each reviewer gave it.
  class Review
    attr_reader :title, :classification, :date, :duration, :genre, :ratings, :url

    # Fetches the review index page, follows every link inside its first
    # "table.sideRating" element and returns the linked pages that parse as
    # reviews. Links that yield nil are dropped.
    def self.latest
      index = WWW::Mechanize.new.get('http://www.abc.net.au/atthemovies/review/')
      links = index.search('table.sideRating').first.css('a')
      links.map { |link| AtTheMovies::Parsers.for(link['href'], :only => "review") }.compact
    end

    # ratings is expected to map a capitalized reviewer name to a numeric
    # score; all other arguments are stored as given.
    def initialize(title, classification, date, duration, genre, ratings, url)
      @title, @classification, @date = title, classification, date
      @duration, @genre              = duration, genre
      @ratings, @url                 = ratings, url
    end

    # With no argument: the combined score of all reviewers.
    # With a reviewer (String or Symbol, any capitalization of the first
    # letter): that reviewer's score, or nil when they are unknown.
    def rating(rater = nil)
      rater ? @ratings[rater.to_s.capitalize] : total_rating
    end

    # Sum of every reviewer's score; 0 when there are no ratings.
    def total_rating
      @ratings.inject(0) { |sum, (_person, score)| sum + score }
    end
  end
end
@@ -0,0 +1,3 @@
1
module AtTheMovies
  # Library version. Must stay in step with the gem's top-level VERSION file,
  # which declares 0.1.0 for this release.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1 @@
1
+ require File.dirname(__FILE__) + '/core_ext/enumerable'
@@ -0,0 +1,6 @@
1
module Enumerable
  # Like #inject with an explicit initial value, but the block also receives
  # the zero-based position of the current element.
  #
  # injected - the initial accumulator value.
  #
  # Yields the accumulator, the element and its index; the block's return
  # value becomes the next accumulator.
  # Returns the final accumulator (the initial value for an empty collection).
  def inject_with_index(injected)
    each_with_index.inject(injected) do |memo, (obj, index)|
      yield(memo, obj, index)
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Checks every attribute of an Interview produced by parsing a cached page.
describe AtTheMovies::Interview do
  context "with a parsed interview" do
    before do
      address = 'http://www.abc.net.au/atthemovies/txt/s2642594.htm'
      # Serve the cached copy of the page instead of hitting the network.
      FakeWeb.register_uri(:get, address, :response => cached_page_for(address))
      fetched = WWW::Mechanize.new.get(address)
      @interview = AtTheMovies::Parsers::Interview.parse(fetched)
    end

    it "should have a title" do
      @interview.title.should == "The Cove Interview"
    end

    it "should have a date" do
      @interview.date.to_s.should == "2009-08-19"
    end

    it "should have an interviewee" do
      @interview.interviewee.should == "filmmaker Louie Psihoyos"
    end

    it "should have an interviewer" do
      @interview.interviewer.should == "David"
    end

    it "should have a url" do
      @interview.url.should == "http://www.abc.net.au/atthemovies/txt/s2642594.htm"
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# The base parser has no parsing logic of its own and must refuse to parse.
describe AtTheMovies::Parser do
  it "should act as a base class" do
    base = AtTheMovies::Parser.new(nil)
    lambda do
      base.parse
    end.should raise_error(NotImplementedError)
  end
end
@@ -0,0 +1,17 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
# Parsing a cached interview page must produce an Interview model object.
describe AtTheMovies::Parsers::Interview do
  describe "parse" do
    before do
      address = 'http://www.abc.net.au/atthemovies/txt/s2642594.htm'
      # Serve the cached copy of the page instead of hitting the network.
      FakeWeb.register_uri(:get, address, :response => cached_page_for(address))
      fetched = WWW::Mechanize.new.get(address)
      @interview = AtTheMovies::Parsers::Interview.parse(fetched)
    end

    it "should return an Interview object" do
      @interview.should be_an_instance_of(AtTheMovies::Interview)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
# Parsing a cached review page must produce a Review model object.
describe AtTheMovies::Parsers::Review do
  describe "parse" do
    before do
      address = 'http://www.abc.net.au/atthemovies/txt/s1533013.htm'
      # Serve the cached copy of the page instead of hitting the network.
      FakeWeb.register_uri(:get, address, :response => cached_page_for(address))
      fetched = WWW::Mechanize.new.get(address)
      @review = AtTheMovies::Parsers::Review.parse(fetched)
    end

    it "should return a Review object" do
      @review.should be_an_instance_of(AtTheMovies::Review)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Covers the dispatching entry point: type lookup, the :only filter and the
# retry-then-raise behaviour on fetch failures.
describe AtTheMovies::Parsers do
  describe "for" do
    context "with a review page" do
      before do
        address = 'http://www.abc.net.au/atthemovies/txt/s1533013.htm'
        FakeWeb.register_uri(:get, address, :response => cached_page_for(address))
        @parsed = AtTheMovies::Parsers.for(address)
      end

      it "should return a Review object" do
        @parsed.should be_an_instance_of(AtTheMovies::Review)
      end
    end

    context "using only" do
      before do
        address = 'http://www.abc.net.au/atthemovies/s2634329.htm'
        FakeWeb.register_uri(:get, address, :response => cached_page_for(address))
        @parsed = AtTheMovies::Parsers.for(address, :only => "review")
      end

      it "should return nothing for review when page is an interview" do
        @parsed.should be_nil
      end
    end

    context "parsing issues" do
      it "should try five times before reraising" do
        # Every attempt to build a Mechanize agent fails with a refused connection.
        WWW::Mechanize.should_receive(:new).exactly(5).times.and_raise(Errno::ECONNREFUSED)
        attempt = lambda { AtTheMovies::Parsers.for('url') }
        attempt.should raise_error(AtTheMovies::ParserError, "Failed to parse url. Connection refused")
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Checks the attributes of a Review parsed from a cached page, and that
# Review.latest only returns the pages that are reviews.
describe AtTheMovies::Review do
  context "with a parsed review" do
    before do
      address = 'http://www.abc.net.au/atthemovies/txt/s1533013.htm'
      # Serve the cached copy of the page instead of hitting the network.
      FakeWeb.register_uri(:get, address, :response => cached_page_for(address))
      fetched = WWW::Mechanize.new.get(address)
      @review = AtTheMovies::Parsers::Review.parse(fetched)
    end

    it "should have a title" do
      @review.title.should == "Brokeback Mountain"
    end

    it "should have a classification" do
      @review.classification.should == "M"
    end

    it "should have a date" do
      @review.date.to_s.should == "2005-12-18"
    end

    it "should have a duration" do
      @review.duration.should == 134
    end

    it "should have a genre" do
      @review.genre.should == "Drama"
    end

    it "should have a total rating" do
      @review.rating.should == 10
    end

    it "should have a rating by Margaret" do
      @review.rating(:margaret).should == 5
    end

    it "should have a rating by David" do
      @review.rating(:david).should == 5
    end

    it "should have a url" do
      @review.url.should == "http://www.abc.net.au/atthemovies/txt/s1533013.htm"
    end
  end

  context "finding" do
    context "latest" do
      before do
        index_address = 'http://www.abc.net.au/atthemovies/review/'
        FakeWeb.register_uri(:get, index_address, :response => cached_page('reviews'))

        # Register a cached copy of every page linked from the index.
        linked_pages = %w( s2625733 s2625742 s2625654 s2625717 s2634329 s2631026 )
        linked_pages.each do |uri|
          FakeWeb.register_uri(:get, "http://www.abc.net.au/atthemovies/txt/#{uri}.htm", :response => cached_page("#{uri}.htm"))
        end

        @reviews = AtTheMovies::Review.latest
      end

      it "should contain the latest reviews only" do
        titles = @reviews.collect { |review| review.title }
        titles.should == ["Public Enemies", "Coraline", "My Sister's Keeper", "Cedar Boys"]
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
# Spec helpers that locate the cached copies of pages used as fixtures.
module PageHelperMethods
  # Returns the path of the cached page for +url+, identified by the
  # "s<digits>.htm" file name inside the URL.
  #
  # The dot is matched literally so that only a real ".htm" file name is
  # picked up, not an earlier segment that merely looks similar.
  def cached_page_for(url)
    page = url[/s[0-9]+\.htm/]
    cached_page(page)
  end

  # Returns the path of the cached page called +name+ under SPEC_DIR/pages.
  def cached_page(name)
    SPEC_DIR + "/pages/#{name}"
  end
end