RubyGems - yayimdbs - Versions diffs - 0.1.10 → 0.2.0 - Mend

yayimdbs 0.1.10 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

data/README.md CHANGED Viewed

@@ -4,7 +4,7 @@ Overview
 --------
 Yet Another Ying IMDB Scraper
-This is a simple imdb scraper, that i created as part of my [onbox](http://github.com/o-sam-o/onbox) project.  I have moved it out into it's own gem so i can share it across projects.
+This is a simple imdb scraper, that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project.  I have moved it out into its own gem so I can share it across projects.
 Features
 --------
@@ -49,4 +49,4 @@ MIT
 Contact
 -------
-Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
+Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)

data/lib/yay_imdbs.rb CHANGED Viewed

@@ -18,220 +18,218 @@ class YayImdbs
   STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
-  def self.search_for_imdb_id(name, year=nil, type=nil)
-    search_results = self.search_imdb(name)
-    return nil if search_results.empty?
-    search_results.each do |result|
-      # Ensure result is the correct video type
-      next if type && (result[:video_type] != type)
-      # If no year provided just return first result
-      return result[:imdb_id] if !year || result[:year] == year
-    end
-    return nil
-  end
-  def self.search_imdb(search_term)
-    search_results = []
-    doc = self.get_search_page(search_term)
-    # If the search is an exact match imdb will redirect to the movie page not search results page
-    # we uses the the title meta element to determine if we got an exact match
-    movie_title, movie_year = get_title_and_year_from_meta(doc)
-    if movie_title
-      canonical_link = doc.xpath("//link[@rel='canonical']")
-      if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
-        return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
-      else
-        raise "Unable to extract imdb id from exact search result"
+  DATE_PROPERTIES = [:release_date]
+  LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
+  INT_LIST_PROPERTIES = [:year, :season]
+  PROPERTY_ALIAS  = {:genres => :genre,
+                     :taglines => :tagline,
+                     :year => :years,
+                     :season => :seasons,
+                     :language => :languages,
+                     :motion_picture_rating_mpaa => :mpaa}
+  class << self
+    def search_for_imdb_id(name, year=nil, type=nil)
+      search_results = self.search_imdb(name)
+      search_results.each do |result|
+        # Ensure result is the correct video type
+        next if type && (result[:video_type] != type)
+        # If no year provided just return first result
+        return result[:imdb_id] if year.nil? || result[:year] == year
       end
+      return nil
     end
-    doc.xpath("//td").each do |td|
-      td.xpath(".//a").each do |link|
-        href = link['href']
-        current_name = link.content
-        # Ignore links with no text (e.g. image links)
-        next unless current_name.present?
-        current_name = self.clean_title(current_name)
-        if href =~ /^\/title\/tt(\d+)/
+    def search_imdb(search_term)
+      search_results = []
+      doc = self.get_search_page(search_term)
+      # If the search is an exact match imdb will redirect to the movie page not search results page
+      # we use the title meta element to determine if we got an exact match
+      movie_title, movie_year = get_title_and_year_from_meta(doc)
+      if movie_title
+        canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
+        if canonical_link && canonical_link =~ /tt(\d+)\//
+          return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
+        else
+          raise "Unable to extract imdb id from exact search result"
+        end
+      end
+      doc.css("td").each do |td|
+        td.css("a").each do |link|
+          href = link['href']
+          current_name = link.content
+          # Ignore links with no text (e.g. image links) or links that don't link to movie pages
+          next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
           imdb_id = $1
           current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
-          search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type => self.video_type(td)}
+          search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
         end
       end
-    end
-    return search_results
-  end
-  def self.scrap_movie_info(imdb_id)
-    info_hash = {:imdb_id => imdb_id}.with_indifferent_access
-    doc = self.get_movie_page(imdb_id)
-    info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
-    if info_hash['title'].nil?
-      #If we cant get title and year something is wrong
-      raise "Unable to find title or year for imdb id #{imdb_id}"
-    end
-    info_hash['video_type'] = self.video_type_from_meta(doc)
-    info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
-    found_info_divs = false
-    doc.xpath("//div/h4").each do |h4|
-      div = h4.parent
-      found_info_divs = true
-      raw_key = h4.inner_text
-      key = raw_key.sub(':', '').strip.downcase
-      value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
-      value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)$/, '').strip
+      return search_results
+    end
+    def scrap_movie_info(imdb_id)
+      info_hash = {:imdb_id => imdb_id}.with_indifferent_access
+      doc = self.get_movie_page(imdb_id)
+      title, year = get_title_and_year_from_meta(doc)
+      info_hash[:title], info_hash[:year] = title, year
+      if info_hash['title'].nil?
+        #If we cant get title and year something is wrong
+        raise "Unable to find title or year for imdb id #{imdb_id}"
+      end
+      info_hash[:video_type] = self.video_type_from_meta(doc)
-      if key == 'release date'
-        begin
-          value = Date.strptime(value, '%d %B %Y')
-        rescue
-          p "Invalid date '#{value}' for imdb id: #{imdb_id}"
-          value = nil
-        end
-      elsif key == 'runtime'
+      info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
+      info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil
+      found_info_divs = false
+      movie_properties(doc) do |key, value|
+        found_info_divs = true
+        info_hash["raw_#{key}"] = value
+        info_hash[key] = clean_movie_property(key, value)
+        info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
+      end
+      if not found_info_divs
+        #If we don't find any info divs assume parsing failed
+        raise "No info divs found for imdb id #{imdb_id}"
+      end
+      # Hack: tv shows can have a year property, which is a list, fixing ...
+      info_hash[:year] = year
+      self.scrap_images(doc, info_hash)
+      #scrap episodes if tv series
+      if info_hash.has_key?('season')
+        self.scrap_episodes(info_hash)
+      end
+      return info_hash
+    end
+    def clean_movie_property(key, value)
+      if DATE_PROPERTIES.include?(key)
+        value = Date.strptime(value, '%d %B %Y') rescue nil
+      elsif key == :runtime
         if value =~ /(\d+)\smin/
           value = $1.to_i
         else
-          p "Unexpected runtime format #{value} for movie #{imdb_id}"
+          value = nil
         end
-      elsif key == 'genres'
+      elsif LIST_PROPERTIES.include?(key)
         value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
-        # Backwards compatibility hack
-        info_hash[:genre] = value
-      elsif key == 'year'
+      elsif INT_LIST_PROPERTIES.include?(key)
         value = value.split('|').collect { |l| l.strip.to_i }.reject { |y| y <= 0 }
-        # TV shows can have multiple years
-        info_hash[:years] = value
-        value = value.sort.first
-      elsif key == 'language'
-        value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9]/, '') }
-      elsif key == 'taglines'
-        # Backwards compatibility
-        info_hash['tagline'] = value
-      elsif key == 'motion picture rating (mpaa)'
-        value = value.gsub(/See all certifications/, '').strip
-        # Backwards compatibility FIXME do with a map
-        info_hash['mpaa'] = value
       end
-      info_hash[key.downcase.gsub(/\s/, '_')] = value
-    end
-    if not found_info_divs
-      #If we don't find any info divs assume parsing failed
-      raise "No info divs found for imdb id #{imdb_id}"
-    end
-    self.scrap_images(doc, info_hash)
-    #scrap episodes if tv series
-    if info_hash.has_key?('season')
-      self.scrap_episodes(info_hash)
+      return value
     end
-    return info_hash
-  end
-  private
-   def self.scrap_images(doc, info_hash)
-    #scrap poster image urls
-    thumb = doc.xpath("//td[@id = 'img_primary']/a/img")
-    if thumb.first
-      thumbnail_url = thumb.first['src']
-      if not thumbnail_url =~ /\/nopicture\//
-        info_hash['medium_image'] = thumbnail_url
-        # Small thumbnail image, gotten by hacking medium url
-        info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
-        #Try to scrap a larger version of the image url
-        large_img_page = doc.xpath("//td[@id = 'img_primary']/a").first['href']
-        large_img_doc = self.get_media_page(large_img_page)
-        large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
-        info_hash['large_image'] = large_img_url
+    def movie_properties(doc)
+      doc.css("div h4").each do |h4|
+        div = h4.parent
+        raw_key = h4.inner_text
+        key = raw_key.sub(':', '').strip.downcase
+        value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
+        value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)$/, '').strip
+        symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
+        yield symbol_key, value
       end
     end
-   end
-   def self.scrap_episodes(info_hash)
+    def scrap_images(doc, info_hash)
+      #scrap poster image urls
+      thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
+      return if thumbnail_url.nil? || thumbnail_url =~ /\/nopicture\//
+      info_hash['medium_image'] = thumbnail_url
+      # Small thumbnail image, gotten by hacking medium url
+      info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
+      #Try to scrap a larger version of the image url
+      large_img_page_link = doc.at_css("td[id=img_primary] a").try(:[], 'href')
+      return unless large_img_page_link
+      large_img_doc = get_media_page(large_img_page_link)
+      large_img_url = large_img_doc.at_css("img[id=primary-img]").try(:[], 'src')
+      info_hash['large_image'] = large_img_url
+    end
+    def scrap_episodes(info_hash)
       episodes = []
       doc = self.get_episodes_page(info_hash[:imdb_id])
-      episode_divs = doc.css(".filter-all")
-      episode_divs.each do |e_div|
-        if e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
+      doc.css(".filter-all").each do |e_div|
+        next unless e_div.at_css('h3').inner_text =~ /Season (\d+), Episode (\d+):/
           episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
-          raw_date = e_div.xpath('.//span/strong').inner_text.strip
-          episode['date'] = Date.parse(raw_date)
-          if e_div.inner_text =~ /#{raw_date}/
-            episode['plot'] = $'.strip
-          end
-          episodes << episode
+        raw_date = e_div.at_css('strong').inner_text.strip
+        episode['date'] = Date.parse(raw_date) rescue nil
+        if e_div.inner_text =~ /#{raw_date}/
+          episode['plot'] = $'.strip
         end
+        episodes << episode
       end
       info_hash['episodes'] = episodes
-   end
-    def self.get_search_page(name)
-      Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
     end
-    def self.get_movie_page(imdb_id)
-      Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
-    end
+      def get_search_page(name)
+        Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
+      end
-    def self.get_episodes_page(imdb_id)
-      Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
-    end
+      def get_movie_page(imdb_id)
+        Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
+      end
-    def self.get_media_page(url_fragment)
-      Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
-     end
+      def get_episodes_page(imdb_id)
+        Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
+      end
+      def get_media_page(url_fragment)
+        Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
+       end
-    def self.get_title_and_year_from_meta(doc)
-      return nil, nil unless doc.xpath("//meta[@name='title']").first
+      def get_title_and_year_from_meta(doc)
+        title_text = doc.at_css("meta[name='title']").try(:[], 'content')
+        # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
+        if title_text && title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
+          movie_title = self.clean_title($1)
+          movie_year = $2.to_i
+        end
+        return movie_title, movie_year
+      end
+      # Remove surrounding double quotes that seem to appear on TV show names
+      def clean_title(movie_title)
+        movie_title = $1 if movie_title =~ /^"(.*)"$/
+        return movie_title.strip
+      end
-      title_text = doc.xpath("//meta[@name='title']").first['content']
-      # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
-      if title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
-        movie_title = $1
-        movie_year = $2.to_i
-        movie_title = self.clean_title(movie_title)
+      # Hackyness to get around ruby 1.9 encoding issue
+      def strip_whitespace(s)
+        s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
+      end
+      def video_type(td)
+        return :tv_show if td.content =~ /\((TV series|TV)\)/
+        return :movie
+      end
+      def video_type_from_meta(doc)
+        type_text = doc.at_css("meta[property='og:type']").try(:[], 'content')
+        type_text == 'tv_show' ? :tv_show : :movie
       end
-      return movie_title, movie_year
-    end
-    # Remove surrounding double quotes that seems to appear on tv show name
-    def self.clean_title(movie_title)
-      movie_title = $1 if movie_title =~ /^"(.*)"$/
-      return movie_title.strip
-    end
-    # Hackyness to get around ruby 1.9 encoding issue
-    def self.strip_whitespace(s)
-      s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
-    end
-    def self.video_type(td)
-      return :tv_show if td.content =~ /\((TV series|TV)\)/
-      return :movie
-    end
-    def self.video_type_from_meta(doc)
-      meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
-      return :movie unless meta_type_tag.first
-      type_text = meta_type_tag.first['content']
-      case type_text
-        when 'tv_show' then return :tv_show
-        else return :movie
-      end
     end
 end

metadata CHANGED Viewed

@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 1
-  - 10
-  version: 0.1.10
+  - 2
+  - 0
+  version: 0.2.0
 platform: ruby
 authors:
 - Sam Cavenagh
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-02-12 00:00:00 +11:00
+date: 2011-03-06 00:00:00 +11:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency