RubyGems - royw-imdb - Versions diffs - 0.0.8 - Mend

royw-imdb 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/README ADDED Viewed

@@ -0,0 +1,77 @@
+ImdbMovie Indiana Jones and the Last Crusade
+- should query IMDB url
+- should get the title
+- should get director(s)
+- should get the poster url
+- should return an ImdbImage object
+- should get the rating
+- should get cast members
+- should get the writers
+- should get the year
+- should get the release date
+- should get the genres
+- should get the plot
+- should get the length
+- should get the countries
+- should get the languages
+- should get the color
+- should get the company
+- should get some photos
+- should get the tagline
+- should get the aspect ratio
+ImdbMovie Han robado una estrella
+- should query IMDB url
+- should get the title
+- should get director(s)
+- should not get the poster
+- should get cast members
+- should get the writers
+- should get the release date
+- should get the genres
+- should not get the plot
+- should get the length
+- should get the countries
+- should get the languages
+- should not get the color
+- should get the company
+- should not get any photos
+ImdbSearch search that returns multiple movies
+- should query IMDB url
+ImdbSearch search that returns multiple movies movies
+- should be a collection of ImdbMovie instances
+- should include 'Indiana Jones and the Last Crusade'
+- should not have titles with HTML tags
+- should not have duplicate movies
+ImdbSearch search that redirects to the lone matching movie movies
+- should be a collection containing a single ImdbMovie instance
+- should have the correct ID
+- should have the correct title
+ImdbSearch searches that match on AKA title movies
+- should have multiple movies
+- should find id tt0127357
+- should have only one movie from 1998
+ImdbSearch searches that match on AKA title but without search_aka enabled movies
+- should have multiple movies
+- should have 2 movies from 1998
+ImdbMovie Indiana Jones and the Last Crusade
+- should query IMDB url
+- should get the image
+String unescape_html
+- should convert &amp; to &
+- should convert &#243; to ó
+String strip_tags
+- should strip HTML tags
+Finished in 5.222681 seconds
+53 examples, 0 failures

data/lib/file_extensions.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# == Synopsis
+# add a mkdirs method to the File class
+class File
+  ##
+  # make directories including any missing in the path
+  #
+  # @param [String] dirspec the path to make sure exists
+  def File.mkdirs(dirspec)
+    unless File.exists?(dirspec)
+      mkdirs(File.dirname(dirspec))
+      Dir.mkdir(dirspec)
+    end
+  end
+end

data/lib/imdb.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'open-uri'
+require 'date'
+require 'cgi'
+require 'rubygems'
+require 'hpricot'
+require 'chronic'
+require File.dirname(__FILE__) + '/imdb/imdb_search'
+require File.dirname(__FILE__) + '/imdb/imdb_movie'
+require File.dirname(__FILE__) + '/imdb/imdb_image'
+require File.dirname(__FILE__) + '/string_extensions'
+require File.dirname(__FILE__) + '/file_extensions'
+require File.dirname(__FILE__) + '/object_extensions'

data/lib/imdb/imdb_image.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# @imdb_movie.poster.should == 'http://ia.media-imdb.com/images/M/MV5BMTkzODA5ODYwOV5BMl5BanBnXkFtZTcwMjAyNDYyMQ@@._V1._SX216_SY316_.jpg'
+class ImdbImage
+  attr_accessor :url
+  def initialize(url)
+    @url = File.join("http://www.imdb.com/", url)
+  end
+  def image
+    document.at("table#principal tr td img")['src'] rescue nil
+  end
+  def document
+    @document ||= Hpricot(open(self.url).read)
+  end
+end

data/lib/imdb/imdb_movie.rb ADDED Viewed

@@ -0,0 +1,224 @@
+class ImdbMovie
+  include Comparable
+  attr_reader :id, :url#, :title
+  def initialize(id, title = nil)
+    @id = id
+#     @url = "http://www.imdb.com/title/tt#{@id}/"
+    @url = sprintf(ImdbMovie::url_format, @id.to_s)
+    @title = title
+  end
+  # this is intended to be stubed by rspec where it
+  # should return the path to the cached html file
+  # Note, the returned String should have one '%s'
+  # which will replaced by sprintf with @id.to_s
+  def self.url_format
+    'http://www.imdb.com/title/tt%s/'
+  end
+  # this is intended to be stubed by rspec where it
+  # should return true.
+  def self.use_html_cache
+    false
+  end
+  # add comparator so Arrays containing ImdbMovie objects
+  # can use uniq()
+  def <=>(other)
+    @id <=> other.id
+  end
+  def title
+    if @title.nil?
+      @title = document.at("div#tn15title h1").innerHTML.split('<span>').first.strip.unescape_html rescue nil
+    end
+    @title
+  end
+  def directors
+    document.search("h5[text()^='Director'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
+  end
+  def poster_url
+    document.at("a[@name='poster']")['href'] rescue nil
+  end
+  def tiny_poster_url
+    document.at("a[@name='poster'] img")['src'] rescue nil
+  end
+  def poster
+    ImdbImage.new(poster_url) rescue nil
+  end
+  def rating
+    document.at("h5[text()='User Rating:'] ~ b").innerHTML.strip.unescape_html.split('/').first.to_f rescue nil
+  end
+  def cast_members
+    # document.search("table.cast td.nm a").map { |link| link.innerHTML.strip.unescape_html } rescue []
+    document.search("table.cast tr").inject([]) do |result, row|
+      a = row.search("td.nm a").innerHTML.strip.unescape_html
+      c = row.search("td.char a").innerHTML.strip.unescape_html
+      if c.empty?
+        c = row.search("td.char").innerHTML.strip.unescape_html
+      end
+      result << [a,c]
+    end
+  end
+  def writers
+    document.search("h5[text()^='Writer'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
+  end
+  def year
+    document.search('a[@href^="/Sections/Years/"]').innerHTML
+  end
+  def release_date
+    date = document.search("//h5[text()^='Release Date']/..").innerHTML[/^\d{1,2} \w+ \d{4}/]
+    Date.parse(Chronic.parse(date).strftime('%Y/%m/%d'))
+  rescue
+    nil
+  end
+  def genres
+    document.search("h5[text()='Genre:'] ~ a[@href*=/Sections/Genres/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
+  end
+  def plot
+    document.search("//h5[text()^='Plot']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
+  end
+  def tagline
+    document.search("//h5[text()^='Tagline']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
+  end
+  def aspect_ratio
+    document.search("//h5[text()^='Aspect Ratio']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
+  end
+  def length
+    document.search("//h5[text()^='Runtime']/..").innerHTML[/\d+ min/] rescue nil
+  end
+  def countries
+    document.search("h5[text()='Country:'] ~ a[@href*=/Sections/Countries/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
+  end
+  def languages
+    document.search("h5[text()='Language:'] ~ a[@href*=/Sections/Languages/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
+  end
+  def color
+    document.at("h5[text()='Color:'] ~ a[@href*=color-info']").innerHTML.strip.unescape_html rescue nil
+  end
+  def company
+    document.at("h5[text()='Company:'] ~ a[@href*=/company/']").innerHTML.strip.unescape_html rescue nil
+  end
+  def photos
+    document.search(".media_strip_thumb img").map { |img| img['src'] } rescue []
+  end
+#   def get_data
+#     update_title
+#   end
+#   def title2
+#     document.at("div#tn15title h1").innerHTML.split('<span>').first.unescape_html rescue nil
+#   end
+  # return the raw title
+  def raw_title
+    document.at("h1").innerText
+  end
+  # is this a video game as indicated by a '(VG)' in the raw title?
+  def video_game?
+    raw_title =~ /\(VG\)/
+  end
+  # find the release year
+  # Note, this is needed because not all entries on IMDB have a full
+  # release date as parsed by release_date.
+  def release_year
+    document.search("//h5[text()^='Release Date']/..").innerHTML[/\d{4}/]
+  end
+  # return an Array of Strings containing AKA titles
+  def also_known_as
+    el = document.search("//h5[text()^='Also Known As:']/..").at('h5')
+    aka = []
+    while(!el.nil?)
+      aka << el.to_s unless el.elem?
+      el = el.next
+    end
+    aka.collect!{|a| a.gsub(/\([^\)]*\)/, '').strip}
+    aka.uniq!
+    aka.compact!
+    aka.select{|a| !a.empty?}
+  end
+  # The MPAA rating, i.e. "PG-13"
+  def mpaa
+    document.search("//h5[text()^='MPAA']/..").text.gsub('MPAA:', '').strip rescue nil
+  end
+  # older films may not have MPAA ratings but usually have a certification.
+  # return a hash with country abbreviations for keys and the certification string for the value
+  # example:  {'USA' => 'Approved'}
+  def certifications
+    cert_hash = {}
+    certs = document.search("h5[text()='Certification:'] ~ a[@href*=/List?certificates']").map { |link| link.innerHTML.strip } rescue []
+    certs.each { |line| cert_hash[$1] = $2 if line =~ /(.*):(.*)/ }
+    cert_hash
+  end
+  private
+#   def update_title
+#     @title = document.at("h1").innerHTML.split('<span').first.strip.unescape_html rescue nil
+#     #document.at("div#tn15title h1").innerHTML.split('<span>').first.unescape_html rescue nil
+#   end
+  MAX_ATTEMPTS = 3
+  SECONDS_BETWEEN_RETRIES = 1.0
+  # Fetch the document with retry to handle the occasional glitches
+  def document
+    attempts = 0
+    begin
+      html = open(self.url).read
+      @document ||= Hpricot(html)
+      cache_html_files(html) if ImdbMovie::use_html_cache
+    rescue Exception => e
+      attempts += 1
+      if attempts > MAX_ATTEMPTS
+        raise
+      else
+        sleep SECONDS_BETWEEN_RETRIES
+        retry
+      end
+    end
+    @document
+  end
+  # this is used to save imdb pages so they may be used by rspec
+  def cache_html_files(html)
+    begin
+      filespec = self.url.gsub(/^http:\//, 'spec/samples').gsub(/\/$/, '.html')
+      unless File.exist?(filespec)
+        puts filespec
+        File.mkdirs(File.dirname(filespec))
+        File.open(filespec, 'w') { |f| f.puts html }
+      end
+    rescue Exception => eMsg
+      puts eMsg.to_s
+    end
+  end
+end

data/lib/imdb/imdb_search.rb ADDED Viewed

@@ -0,0 +1,111 @@
+class ImdbSearch
+  attr_reader :query
+  def initialize(query, search_akas=false)
+    @query = query
+    @search_akas = search_akas
+  end
+  def movies
+    @movies ||= parse_movies_from_document
+  end
+  # Find the IMDB ID for the current search title
+  # The find can be helped a lot by including a years option that contains
+  # an Array of integers that are the production year (plus/minus a year)
+  # and the release year.
+  def find_id(options={})
+    id = nil
+    found_movies = self.movies
+    unless found_movies.nil?
+      desired_movies = found_movies.select do |m|
+        aka = m.also_known_as
+        result = imdb_compare_titles(m.title, aka, @query) && !m.video_game? && !m.release_year.nil?
+        if result
+          unless options[:years].nil?
+            result = options[:years].include?(m.release_year.to_i)
+          end
+        end
+        result
+      end
+      ids = desired_movies.collect{|m| m.id}.uniq.compact
+      if ids.length == 1
+        id = "tt#{ids[0]}"
+      end
+    end
+    id
+  end
+  protected
+  # compare the imdb title and the imdb title's AKAs against the media title.
+  # note, on exact match lookups, IMDB will sometimes set the title to
+  # 'trailers and videos' instead of the correct title.
+  def imdb_compare_titles(imdb_title, aka_titles, media_title)
+    result = fuzzy_compare_titles(imdb_title, media_title)
+    unless result
+      result = fuzzy_compare_titles(imdb_title, 'trailers and videos')
+      unless result
+        aka_titles.each do |aka|
+          result = fuzzy_compare_titles(aka, media_title)
+          break if result
+        end
+      end
+    end
+    result
+  end
+  # a fuzzy compare that is case insensitive and replaces '&' with 'and'
+  # (because that is what IMDB occasionally does)
+  def fuzzy_compare_titles(title1, title2)
+    t1 = title1.downcase
+    t2 = title2.downcase
+    (t1 == t2) ||
+    (t1.gsub(/&/, 'and') == t2.gsub(/&/, 'and')) ||
+    (t1.gsub(/[-:]/, ' ') == t2.gsub(/[-:]/, ' ')) ||
+    (t1.gsub('more at imdbpro ?', '') == t2)
+  end
+  private
+  def document
+    filespec = "http://www.imdb.com/find?q=#{CGI::escape(@query)};s=tt"
+    @document ||= Hpricot(open(filespec).read)
+  end
+  def parse_movies_from_document
+    exact_match? ? parse_exact_match_search_results : parse_multi_movie_search_results
+  end
+  def parse_exact_match_search_results
+    id = document.at("a[@name='poster']")['href'][/\d+$/]
+    title = document.at("h1").innerHTML.split('<span').first.strip.unescape_html rescue nil
+    [ImdbMovie.new(id, title)]
+  end
+  def parse_multi_movie_search_results
+    ids_and_titles = document.search('a[@href^="/title/tt"]').reject do |element|
+      element.innerHTML.strip_tags.empty?
+    end.map do |element|
+      [element['href'][/\d+/], element.innerHTML.strip_tags.unescape_html]
+    end.uniq
+    films = ids_and_titles.map do |id_and_title|
+      ImdbMovie.new(id_and_title[0], id_and_title[1])
+    end.uniq
+    if films.length > 1 && @search_akas
+      films = films.select do |m|
+        aka = m.also_known_as
+        imdb_compare_titles(m.title, aka, @query) && !m.video_game?
+      end
+    end
+    films
+  end
+  def exact_match?
+    document.search("title[text()='IMDb Title Search']").empty? && !document.search("a[@name='poster']").empty?
+  end
+end

data/lib/object_extensions.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# == Synopsis
+# add a blank? method to all Objects
+class Object
+  # return asserted if object is nil or empty
+  # TODO: not the safest coding, probably should dup before stripping.  Maybe should also compact
+  def blank?
+    result = nil?
+    unless result
+      if respond_to? 'empty?'
+        if respond_to? 'strip'
+          result = strip.empty?
+        else
+          result = empty?
+        end
+      end
+    end
+    result
+  end
+end

data/lib/string_extensions.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require 'cgi'
+require 'iconv'
+module ImdbStringExtensions
+  def unescape_html
+    Iconv.conv("UTF-8", 'ISO-8859-1', CGI::unescapeHTML(self))
+  end
+  def strip_tags
+    gsub(/<\/?[^>]*>/, "")
+  end
+end
+String.send :include, ImdbStringExtensions

metadata ADDED Viewed

@@ -0,0 +1,79 @@
+--- !ruby/object:Gem::Specification
+name: royw-imdb
+version: !ruby/object:Gem::Version
+  version: 0.0.8
+platform: ruby
+authors:
+- Sergio Gil
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-03-21 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.6"
+    version:
+- !ruby/object:Gem::Dependency
+  name: chronic
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description:
+email: sgilperez@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/imdb/imdb_movie.rb
+- lib/imdb/imdb_search.rb
+- lib/imdb/imdb_image.rb
+- lib/imdb.rb
+- lib/file_extensions.rb
+- lib/object_extensions.rb
+- lib/string_extensions.rb
+- README
+has_rdoc: false
+homepage:
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 2
+summary: Internet Movie DataBase
+test_files: []