RubyGems - royw-imdb - Versions diffs - 0.0.8 - Mend

royw-imdb 0.0.8

Files changed (9) hide show

data/README ADDED Viewed

@@ -0,0 +1,77 @@
+ImdbMovie Indiana Jones and the Last Crusade
+- should query IMDB url
+- should get the title
+- should get director(s)
+- should get the poster url
+- should return an ImdbImage object
+- should get the rating
+- should get cast members
+- should get the writers
+- should get the year
+- should get the release date
+- should get the genres
+- should get the plot
+- should get the length
+- should get the countries
+- should get the languages
+- should get the color
+- should get the company
+- should get some photos
+- should get the tagline
+- should get the aspect ratio
+ImdbMovie Han robado una estrella
+- should query IMDB url
+- should get the title
+- should get director(s)
+- should not get the poster
+- should get cast members
+- should get the writers
+- should get the release date
+- should get the genres
+- should not get the plot
+- should get the length
+- should get the countries
+- should get the languages
+- should not get the color
+- should get the company
+- should not get any photos
+ImdbSearch search that returns multiple movies
+- should query IMDB url
+ImdbSearch search that returns multiple movies movies
+- should be a collection of ImdbMovie instances
+- should include 'Indiana Jones and the Last Crusade'
+- should not have titles with HTML tags
+- should not have duplicate movies
+ImdbSearch search that redirects to the lone matching movie movies
+- should be a collection containing a single ImdbMovie instance
+- should have the correct ID
+- should have the correct title
+ImdbSearch searches that match on AKA title movies
+- should have multiple movies
+- should find id tt0127357
+- should have only one movie from 1998
+ImdbSearch searches that match on AKA title but without search_aka enabled movies
+- should have multiple movies
+- should have 2 movies from 1998
+ImdbMovie Indiana Jones and the Last Crusade
+- should query IMDB url
+- should get the image
+String unescape_html
+- should convert &amp; to &
+- should convert &#243; to ó
+String strip_tags
+- should strip HTML tags
+Finished in 5.222681 seconds
+53 examples, 0 failures

data/lib/file_extensions.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# == Synopsis
+# add a mkdirs method to the File class
+class File
+  ##
+  # make directories including any missing in the path
+  #
+  # @param [String] dirspec the path to make sure exists
+  def File.mkdirs(dirspec)
+    unless File.exists?(dirspec)
+      mkdirs(File.dirname(dirspec))
+      Dir.mkdir(dirspec)
+    end
+  end
+end

data/lib/imdb.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'open-uri'
+require 'date'
+require 'cgi'
+require 'rubygems'
+require 'hpricot'
+require 'chronic'
+require File.dirname(__FILE__) + '/imdb/imdb_search'
+require File.dirname(__FILE__) + '/imdb/imdb_movie'
+require File.dirname(__FILE__) + '/imdb/imdb_image'
+require File.dirname(__FILE__) + '/string_extensions'
+require File.dirname(__FILE__) + '/file_extensions'
+require File.dirname(__FILE__) + '/object_extensions'

data/lib/imdb/imdb_image.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# @imdb_movie.poster.should == 'http://ia.media-imdb.com/images/M/MV5BMTkzODA5ODYwOV5BMl5BanBnXkFtZTcwMjAyNDYyMQ@@._V1._SX216_SY316_.jpg'
+class ImdbImage
+  attr_accessor :url
+  def initialize(url)
+    @url = File.join("http://www.imdb.com/", url)
+  end
+  def image
+    document.at("table#principal tr td img")['src'] rescue nil
+  end
+  def document
+    @document ||= Hpricot(open(self.url).read)
+  end
+end

data/lib/imdb/imdb_movie.rb ADDED Viewed

@@ -0,0 +1,224 @@
+class ImdbMovie
+  include Comparable
+  attr_reader :id, :url#, :title
+  def initialize(id, title = nil)
+    @id = id
+#     @url = "http://www.imdb.com/title/tt#{@id}/"
+    @url = sprintf(ImdbMovie::url_format, @id.to_s)
+    @title = title
+  end
+  # this is intended to be stubed by rspec where it
+  # should return the path to the cached html file
+  # Note, the returned String should have one '%s'
+  # which will replaced by sprintf with @id.to_s
+  def self.url_format
+    'http://www.imdb.com/title/tt%s/'
+  end
+  # this is intended to be stubed by rspec where it
+  # should return true.
+  def self.use_html_cache
+    false
+  end
+  # add comparator so Arrays containing ImdbMovie objects
+  # can use uniq()
+  def <=>(other)
+    @id <=> other.id
+  end
+  def title
+    if @title.nil?
+      @title = document.at("div#tn15title h1").innerHTML.split('<span>').first.strip.unescape_html rescue nil
+    end
+    @title
+  end
+  def directors
+    document.search("h5[text()^='Director'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
+  end
+  def poster_url
+    document.at("a[@name='poster']")['href'] rescue nil
+  end
+  def tiny_poster_url
+    document.at("a[@name='poster'] img")['src'] rescue nil
+  end
+  def poster
+    ImdbImage.new(poster_url) rescue nil
+  end
+  def rating
+    document.at("h5[text()='User Rating:'] ~ b").innerHTML.strip.unescape_html.split('/').first.to_f rescue nil
+  end
+  def cast_members
+    # document.search("table.cast td.nm a").map { |link| link.innerHTML.strip.unescape_html } rescue []
+    document.search("table.cast tr").inject([]) do |result, row|
+      a = row.search("td.nm a").innerHTML.strip.unescape_html
+      c = row.search("td.char a").innerHTML.strip.unescape_html
+      if c.empty?
+        c = row.search("td.char").innerHTML.strip.unescape_html
+      end
+      result << [a,c]
+    end
+  end
+  def writers
+    document.search("h5[text()^='Writer'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
+  end
+  def year
+    document.search('a[@href^="/Sections/Years/"]').innerHTML
+  end
+  def release_date
+    date = document.search("//h5[text()^='Release Date']/..").innerHTML[/^\d{1,2} \w+ \d{4}/]
+    Date.parse(Chronic.parse(date).strftime('%Y/%m/%d'))
+  rescue
+    nil
+  end
+  def genres
+    document.search("h5[text()='Genre:'] ~ a[@href*=/Sections/Genres/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
+  end
+  def plot
+    document.search("//h5[text()^='Plot']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
+  end
+  def tagline
+    document.search("//h5[text()^='Tagline']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
+  end
+  def aspect_ratio
+    document.search("//h5[text()^='Aspect Ratio']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
+  end
+  def length
+    document.search("//h5[text()^='Runtime']/..").innerHTML[/\d+ min/] rescue nil
+  end
+  def countries
+    document.search("h5[text()='Country:'] ~ a[@href*=/Sections/Countries/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
+  end
+  def languages
+    document.search("h5[text()='Language:'] ~ a[@href*=/Sections/Languages/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
+  end
+  def color
+    document.at("h5[text()='Color:'] ~ a[@href*=color-info']").innerHTML.strip.unescape_html rescue nil
+  end
+  def company
+    document.at("h5[text()='Company:'] ~ a[@href*=/company/']").innerHTML.strip.unescape_html rescue nil
+  end
+  def photos
+    document.search(".media_strip_thumb img").map { |img| img['src'] } rescue []
+  end
+#   def get_data
+#     update_title
+#   end
+#   def title2
+#     document.at("div#tn15title h1").innerHTML.split('<span>').first.unescape_html rescue nil
+#   end
+  # return the raw title
+  def raw_title
+    document.at("h1").innerText
+  end
+  # is this a video game as indicated by a '(VG)' in the raw title?
+  def video_game?
+    raw_title =~ /\(VG\)/
+  end
+  # find the release year
+  # Note, this is needed because not all entries on IMDB have a full
+  # release date as parsed by release_date.
+  def release_year
+    document.search("//h5[text()^='Release Date']/..").innerHTML[/\d{4}/]
+  end
+  # return an Array of Strings containing AKA titles
+  def also_known_as
+    el = document.search("//h5[text()^='Also Known As:']/..").at('h5')
+    aka = []
+    while(!el.nil?)
+      aka << el.to_s unless el.elem?
+      el = el.next
+    end
+    aka.collect!{|a| a.gsub(/\([^\)]*\)/, '').strip}
+    aka.uniq!
+    aka.compact!
+    aka.select{|a| !a.empty?}
+  end
+  # The MPAA rating, i.e. "PG-13"
+  def mpaa
+    document.search("//h5[text()^='MPAA']/..").text.gsub('MPAA:', '').strip rescue nil
+  end
+  # older films may not have MPAA ratings but usually have a certification.
+  # return a hash with country abbreviations for keys and the certification string for the value
+  # example:  {'USA' => 'Approved'}
+  def certifications
+    cert_hash = {}
+    certs = document.search("h5[text()='Certification:'] ~ a[@href*=/List?certificates']").map { |link| link.innerHTML.strip } rescue []
+    certs.each { |line| cert_hash[$1] = $2 if line =~ /(.*):(.*)/ }
+    cert_hash
+  end
+  private
+#   def update_title
+#     @title = document.at("h1").innerHTML.split('<span').first.strip.unescape_html rescue nil
+#     #document.at("div#tn15title h1").innerHTML.split('<span>').first.unescape_html rescue nil
+#   end
+  MAX_ATTEMPTS = 3
+  SECONDS_BETWEEN_RETRIES = 1.0
+  # Fetch the document with retry to handle the occasional glitches
+  def document
+    attempts = 0
+    begin
+      html = open(self.url).read
+      @document ||= Hpricot(html)
+      cache_html_files(html) if ImdbMovie::use_html_cache
+    rescue Exception => e
+      attempts += 1
+      if attempts > MAX_ATTEMPTS
+        raise
+      else
+        sleep SECONDS_BETWEEN_RETRIES
+        retry
+      end
+    end
+    @document
+  end
+  # this is used to save imdb pages so they may be used by rspec
+  def cache_html_files(html)
+    begin
+      filespec = self.url.gsub(/^http:\//, 'spec/samples').gsub(/\/$/, '.html')
+      unless File.exist?(filespec)
+        puts filespec
+        File.mkdirs(File.dirname(filespec))
+        File.open(filespec, 'w') { |f| f.puts html }
+      end
+    rescue Exception => eMsg
+      puts eMsg.to_s
+    end
+  end
+end

data/lib/imdb/imdb_search.rb ADDED Viewed

@@ -0,0 +1,111 @@
+class ImdbSearch
+  attr_reader :query
+  def initialize(query, search_akas=false)
+    @query = query
+    @search_akas = search_akas
+  end
+  def movies
+    @movies ||= parse_movies_from_document
+  end
+  # Find the IMDB ID for the current search title
+  # The find can be helped a lot by including a years option that contains
+  # an Array of integers that are the production year (plus/minus a year)
+  # and the release year.
+  def find_id(options={})
+    id = nil
+    found_movies = self.movies
+    unless found_movies.nil?
+      desired_movies = found_movies.select do |m|
+        aka = m.also_known_as
+        result = imdb_compare_titles(m.title, aka, @query) && !m.video_game? && !m.release_year.nil?
+        if result
+          unless options[:years].nil?
+            result = options[:years].include?(m.release_year.to_i)
+          end
+        end
+        result
+      end
+      ids = desired_movies.collect{|m| m.id}.uniq.compact
+      if ids.length == 1
+        id = "tt#{ids[0]}"
+      end
+    end
+    id
+  end
+  protected
+  # compare the imdb title and the imdb title's AKAs against the media title.
+  # note, on exact match lookups, IMDB will sometimes set the title to
+  # 'trailers and videos' instead of the correct title.
+  def imdb_compare_titles(imdb_title, aka_titles, media_title)
+    result = fuzzy_compare_titles(imdb_title, media_title)
+    unless result
+      result = fuzzy_compare_titles(imdb_title, 'trailers and videos')
+      unless result
+        aka_titles.each do |aka|
+          result = fuzzy_compare_titles(aka, media_title)
+          break if result
+        end
+      end
+    end
+    result
+  end
+  # a fuzzy compare that is case insensitive and replaces '&' with 'and'
+  # (because that is what IMDB occasionally does)
+  def fuzzy_compare_titles(title1, title2)
+    t1 = title1.downcase
+    t2 = title2.downcase
+    (t1 == t2) ||
+    (t1.gsub(/&/, 'and') == t2.gsub(/&/, 'and')) ||
+    (t1.gsub(/[-:]/, ' ') == t2.gsub(/[-:]/, ' ')) ||
+    (t1.gsub('more at imdbpro ?', '') == t2)
+  end
+  private
+  def document
+    filespec = "http://www.imdb.com/find?q=#{CGI::escape(@query)};s=tt"
+    @document ||= Hpricot(open(filespec).read)
+  end
+  def parse_movies_from_document
+    exact_match? ? parse_exact_match_search_results : parse_multi_movie_search_results
+  end
+  def parse_exact_match_search_results
+    id = document.at("a[@name='poster']")['href'][/\d+$/]
+    title = document.at("h1").innerHTML.split('<span').first.strip.unescape_html rescue nil
+    [ImdbMovie.new(id, title)]
+  end
+  def parse_multi_movie_search_results
+    ids_and_titles = document.search('a[@href^="/title/tt"]').reject do |element|
+      element.innerHTML.strip_tags.empty?
+    end.map do |element|
+      [element['href'][/\d+/], element.innerHTML.strip_tags.unescape_html]
+    end.uniq
+    films = ids_and_titles.map do |id_and_title|
+      ImdbMovie.new(id_and_title[0], id_and_title[1])
+    end.uniq
+    if films.length > 1 && @search_akas
+      films = films.select do |m|
+        aka = m.also_known_as
+        imdb_compare_titles(m.title, aka, @query) && !m.video_game?
+      end
+    end
+    films
+  end
+  def exact_match?
+    document.search("title[text()='IMDb Title Search']").empty? && !document.search("a[@name='poster']").empty?
+  end
+end

data/lib/object_extensions.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# == Synopsis
+# add a blank? method to all Objects
+class Object
+  # return asserted if object is nil or empty
+  # TODO: not the safest coding, probably should dup before stripping.  Maybe should also compact
+  def blank?
+    result = nil?
+    unless result
+      if respond_to? 'empty?'
+        if respond_to? 'strip'
+          result = strip.empty?
+        else
+          result = empty?
+        end
+      end
+    end
+    result
+  end
+end

data/lib/string_extensions.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require 'cgi'
+require 'iconv'
+module ImdbStringExtensions
+  def unescape_html
+    Iconv.conv("UTF-8", 'ISO-8859-1', CGI::unescapeHTML(self))
+  end
+  def strip_tags
+    gsub(/<\/?[^>]*>/, "")
+  end
+end
+String.send :include, ImdbStringExtensions

metadata ADDED Viewed

@@ -0,0 +1,79 @@
+--- !ruby/object:Gem::Specification
+name: royw-imdb
+version: !ruby/object:Gem::Version
+  version: 0.0.8
+platform: ruby
+authors:
+- Sergio Gil
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-03-21 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.6"
+    version:
+- !ruby/object:Gem::Dependency
+  name: chronic
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description:
+email: sgilperez@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/imdb/imdb_movie.rb
+- lib/imdb/imdb_search.rb
+- lib/imdb/imdb_image.rb
+- lib/imdb.rb
+- lib/file_extensions.rb
+- lib/object_extensions.rb
+- lib/string_extensions.rb
+- README
+has_rdoc: false
+homepage:
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 2
+summary: Internet Movie DataBase
+test_files: []