yayimdbs 0.1.10 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
 - data/lib/yay_imdbs.rb +180 -182
 - metadata +4 -4
 
    
        data/README.md
    CHANGED
    
    | 
         @@ -4,7 +4,7 @@ Overview 
     | 
|
| 
       4 
4 
     | 
    
         
             
            --------
         
     | 
| 
       5 
5 
     | 
    
         
             
            Yet Another Ying IMDB Scraper
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
     | 
    
         
            -
            This is a simple imdb scraper, that  
     | 
| 
      
 7 
     | 
    
         
            +
            This is a simple imdb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project.  I have moved it out into its own gem so I can share it across projects.
         
     | 
| 
       8 
8 
     | 
    
         | 
| 
       9 
9 
     | 
    
         
             
            Features
         
     | 
| 
       10 
10 
     | 
    
         
             
            --------
         
     | 
| 
         @@ -49,4 +49,4 @@ MIT 
     | 
|
| 
       49 
49 
     | 
    
         | 
| 
       50 
50 
     | 
    
         
             
            Contact
         
     | 
| 
       51 
51 
     | 
    
         
             
            -------
         
     | 
| 
       52 
     | 
    
         
            -
            Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
         
     | 
| 
      
 52 
     | 
    
         
            +
            Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
         
     | 
    
        data/lib/yay_imdbs.rb
    CHANGED
    
    | 
         @@ -18,220 +18,218 @@ class YayImdbs 
     | 
|
| 
       18 
18 
     | 
    
         | 
| 
       19 
19 
     | 
    
         
             
              STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
     | 
    
         
            -
               
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
             
     | 
| 
       24 
     | 
    
         
            -
              
         
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
                if movie_title
         
     | 
| 
       43 
     | 
    
         
            -
                  canonical_link = doc.xpath("//link[@rel='canonical']")
         
     | 
| 
       44 
     | 
    
         
            -
                  if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
         
     | 
| 
       45 
     | 
    
         
            -
                    return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
         
     | 
| 
       46 
     | 
    
         
            -
                  else
         
     | 
| 
       47 
     | 
    
         
            -
                    raise "Unable to extract imdb id from exact search result"
         
     | 
| 
      
 21 
     | 
    
         
            +
              DATE_PROPERTIES = [:release_date]
         
     | 
| 
      
 22 
     | 
    
         
            +
              LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
         
     | 
| 
      
 23 
     | 
    
         
            +
              INT_LIST_PROPERTIES = [:year, :season]
         
     | 
| 
      
 24 
     | 
    
         
            +
              PROPERTY_ALIAS  = {:genres => :genre, 
         
     | 
| 
      
 25 
     | 
    
         
            +
                                 :taglines => :tagline, 
         
     | 
| 
      
 26 
     | 
    
         
            +
                                 :year => :years, 
         
     | 
| 
      
 27 
     | 
    
         
            +
                                 :season => :seasons,
         
     | 
| 
      
 28 
     | 
    
         
            +
                                 :language => :languages,
         
     | 
| 
      
 29 
     | 
    
         
            +
                                 :motion_picture_rating_mpaa => :mpaa}
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
              class << self
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
                def search_for_imdb_id(name, year=nil, type=nil)
         
     | 
| 
      
 34 
     | 
    
         
            +
                  search_results = self.search_imdb(name)
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
                  search_results.each do |result|
         
     | 
| 
      
 37 
     | 
    
         
            +
                    # Ensure result is the correct video type
         
     | 
| 
      
 38 
     | 
    
         
            +
                    next if type && (result[:video_type] != type)
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
                    # If no year provided just return first result
         
     | 
| 
      
 41 
     | 
    
         
            +
                    return result[:imdb_id] if year.nil? || result[:year] == year
         
     | 
| 
       48 
42 
     | 
    
         
             
                  end
         
     | 
| 
      
 43 
     | 
    
         
            +
                  return nil
         
     | 
| 
       49 
44 
     | 
    
         
             
                end
         
     | 
| 
       50 
     | 
    
         
            -
             
     | 
| 
       51 
     | 
    
         
            -
                 
     | 
| 
       52 
     | 
    
         
            -
                   
     | 
| 
       53 
     | 
    
         
            -
             
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
             
     | 
| 
       57 
     | 
    
         
            -
             
     | 
| 
       58 
     | 
    
         
            -
             
     | 
| 
       59 
     | 
    
         
            -
                  
         
     | 
| 
       60 
     | 
    
         
            -
                     
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
                def search_imdb(search_term)
         
     | 
| 
      
 47 
     | 
    
         
            +
                  search_results = []
         
     | 
| 
      
 48 
     | 
    
         
            +
                
         
     | 
| 
      
 49 
     | 
    
         
            +
                  doc = self.get_search_page(search_term)
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
                  # If the search is an exact match imdb will redirect to the movie page not search results page
         
     | 
| 
      
 52 
     | 
    
         
            +
                  # we use the title meta element to determine if we got an exact match
         
     | 
| 
      
 53 
     | 
    
         
            +
                  movie_title, movie_year = get_title_and_year_from_meta(doc)
         
     | 
| 
      
 54 
     | 
    
         
            +
                  if movie_title
         
     | 
| 
      
 55 
     | 
    
         
            +
                    canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
         
     | 
| 
      
 56 
     | 
    
         
            +
                    if canonical_link && canonical_link =~ /tt(\d+)\//
         
     | 
| 
      
 57 
     | 
    
         
            +
                      return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
         
     | 
| 
      
 58 
     | 
    
         
            +
                    else
         
     | 
| 
      
 59 
     | 
    
         
            +
                      raise "Unable to extract imdb id from exact search result"
         
     | 
| 
      
 60 
     | 
    
         
            +
                    end
         
     | 
| 
      
 61 
     | 
    
         
            +
                  end
         
     | 
| 
      
 62 
     | 
    
         
            +
                
         
     | 
| 
      
 63 
     | 
    
         
            +
                  doc.css("td").each do |td| 
         
     | 
| 
      
 64 
     | 
    
         
            +
                    td.css("a").each do |link|
         
     | 
| 
      
 65 
     | 
    
         
            +
                      href = link['href']
         
     | 
| 
      
 66 
     | 
    
         
            +
                      current_name = link.content
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
                      # Ignore links with no text (e.g. image links) or links that don't link to movie pages
         
     | 
| 
      
 69 
     | 
    
         
            +
                      next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
         
     | 
| 
       61 
70 
     | 
    
         
             
                      imdb_id = $1
         
     | 
| 
       62 
71 
     | 
    
         
             
                      current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
         
     | 
| 
       63 
     | 
    
         
            -
                      search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type =>  
     | 
| 
      
 72 
     | 
    
         
            +
                      search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
         
     | 
| 
       64 
73 
     | 
    
         
             
                    end
         
     | 
| 
       65 
74 
     | 
    
         
             
                  end
         
     | 
| 
       66 
     | 
    
         
            -
                end
         
     | 
| 
       67 
     | 
    
         
            -
              
         
     | 
| 
       68 
     | 
    
         
            -
                return search_results
         
     | 
| 
       69 
     | 
    
         
            -
              end  
         
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
     | 
    
         
            -
              def self.scrap_movie_info(imdb_id)
         
     | 
| 
       72 
     | 
    
         
            -
                info_hash = {:imdb_id => imdb_id}.with_indifferent_access
         
     | 
| 
       73 
     | 
    
         
            -
              
         
     | 
| 
       74 
     | 
    
         
            -
                doc = self.get_movie_page(imdb_id)
         
     | 
| 
       75 
     | 
    
         
            -
                info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
         
     | 
| 
       76 
     | 
    
         
            -
                if info_hash['title'].nil?
         
     | 
| 
       77 
     | 
    
         
            -
                  #If we cant get title and year something is wrong
         
     | 
| 
       78 
     | 
    
         
            -
                  raise "Unable to find title or year for imdb id #{imdb_id}"
         
     | 
| 
       79 
     | 
    
         
            -
                end
         
     | 
| 
       80 
     | 
    
         
            -
                info_hash['video_type'] = self.video_type_from_meta(doc)
         
     | 
| 
       81 
75 
     | 
    
         | 
| 
       82 
     | 
    
         
            -
             
     | 
| 
       83 
     | 
    
         
            -
             
     | 
| 
       84 
     | 
    
         
            -
             
     | 
| 
       85 
     | 
    
         
            -
                 
     | 
| 
       86 
     | 
    
         
            -
                   
     | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
       88 
     | 
    
         
            -
                   
     | 
| 
       89 
     | 
    
         
            -
                   
     | 
| 
       90 
     | 
    
         
            -
                   
     | 
| 
       91 
     | 
    
         
            -
                   
     | 
| 
      
 76 
     | 
    
         
            +
                  return search_results
         
     | 
| 
      
 77 
     | 
    
         
            +
                end  
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
                def scrap_movie_info(imdb_id)
         
     | 
| 
      
 80 
     | 
    
         
            +
                  info_hash = {:imdb_id => imdb_id}.with_indifferent_access
         
     | 
| 
      
 81 
     | 
    
         
            +
                
         
     | 
| 
      
 82 
     | 
    
         
            +
                  doc = self.get_movie_page(imdb_id)
         
     | 
| 
      
 83 
     | 
    
         
            +
                  title, year = get_title_and_year_from_meta(doc)
         
     | 
| 
      
 84 
     | 
    
         
            +
                  info_hash[:title], info_hash[:year] = title, year
         
     | 
| 
      
 85 
     | 
    
         
            +
                  if info_hash['title'].nil?
         
     | 
| 
      
 86 
     | 
    
         
            +
                    #If we cant get title and year something is wrong
         
     | 
| 
      
 87 
     | 
    
         
            +
                    raise "Unable to find title or year for imdb id #{imdb_id}"
         
     | 
| 
      
 88 
     | 
    
         
            +
                  end
         
     | 
| 
      
 89 
     | 
    
         
            +
                  info_hash[:video_type] = self.video_type_from_meta(doc)
         
     | 
| 
       92 
90 
     | 
    
         | 
| 
       93 
     | 
    
         
            -
                   
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
             
     | 
| 
       96 
     | 
    
         
            -
             
     | 
| 
       97 
     | 
    
         
            -
             
     | 
| 
       98 
     | 
    
         
            -
             
     | 
| 
       99 
     | 
    
         
            -
                     
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
      
 91 
     | 
    
         
            +
                  info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
         
     | 
| 
      
 92 
     | 
    
         
            +
                  info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
                  found_info_divs = false
         
     | 
| 
      
 95 
     | 
    
         
            +
                  movie_properties(doc) do |key, value|
         
     | 
| 
      
 96 
     | 
    
         
            +
                    found_info_divs = true
         
     | 
| 
      
 97 
     | 
    
         
            +
                    info_hash["raw_#{key}"] = value
         
     | 
| 
      
 98 
     | 
    
         
            +
                    info_hash[key] = clean_movie_property(key, value)
         
     | 
| 
      
 99 
     | 
    
         
            +
                    info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
         
     | 
| 
      
 100 
     | 
    
         
            +
                  end
         
     | 
| 
      
 101 
     | 
    
         
            +
             
     | 
| 
      
 102 
     | 
    
         
            +
                  if not found_info_divs
         
     | 
| 
      
 103 
     | 
    
         
            +
                    #If we don't find any info divs assume parsing failed
         
     | 
| 
      
 104 
     | 
    
         
            +
                    raise "No info divs found for imdb id #{imdb_id}"
         
     | 
| 
      
 105 
     | 
    
         
            +
                  end
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
                  # Hack: tv shows can have a year property, which is a list, fixing ...
         
     | 
| 
      
 108 
     | 
    
         
            +
                  info_hash[:year] = year
         
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
      
 110 
     | 
    
         
            +
                  self.scrap_images(doc, info_hash)
         
     | 
| 
      
 111 
     | 
    
         
            +
             
     | 
| 
      
 112 
     | 
    
         
            +
                  #scrap episodes if tv series
         
     | 
| 
      
 113 
     | 
    
         
            +
                  if info_hash.has_key?('season')
         
     | 
| 
      
 114 
     | 
    
         
            +
                    self.scrap_episodes(info_hash)
         
     | 
| 
      
 115 
     | 
    
         
            +
                  end
         
     | 
| 
      
 116 
     | 
    
         
            +
             
     | 
| 
      
 117 
     | 
    
         
            +
                  return info_hash
         
     | 
| 
      
 118 
     | 
    
         
            +
                end
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
                def clean_movie_property(key, value)
         
     | 
| 
      
 121 
     | 
    
         
            +
                  if DATE_PROPERTIES.include?(key)
         
     | 
| 
      
 122 
     | 
    
         
            +
                    value = Date.strptime(value, '%d %B %Y') rescue nil
         
     | 
| 
      
 123 
     | 
    
         
            +
                  elsif key == :runtime
         
     | 
| 
       101 
124 
     | 
    
         
             
                    if value =~ /(\d+)\smin/
         
     | 
| 
       102 
125 
     | 
    
         
             
                      value = $1.to_i
         
     | 
| 
       103 
126 
     | 
    
         
             
                    else
         
     | 
| 
       104 
     | 
    
         
            -
                       
     | 
| 
      
 127 
     | 
    
         
            +
                      value = nil
         
     | 
| 
       105 
128 
     | 
    
         
             
                    end
         
     | 
| 
       106 
     | 
    
         
            -
                  elsif key 
     | 
| 
      
 129 
     | 
    
         
            +
                  elsif LIST_PROPERTIES.include?(key)
         
     | 
| 
       107 
130 
     | 
    
         
             
                    value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
         
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
       109 
     | 
    
         
            -
                    info_hash[:genre] = value
         
     | 
| 
       110 
     | 
    
         
            -
                  elsif key == 'year'
         
     | 
| 
      
 131 
     | 
    
         
            +
                  elsif INT_LIST_PROPERTIES.include?(key)
         
     | 
| 
       111 
132 
     | 
    
         
             
                    value = value.split('|').collect { |l| l.strip.to_i }.reject { |y| y <= 0 }
         
     | 
| 
       112 
     | 
    
         
            -
                    # TV shows can have multiple years
         
     | 
| 
       113 
     | 
    
         
            -
                    info_hash[:years] = value
         
     | 
| 
       114 
     | 
    
         
            -
                    value = value.sort.first
         
     | 
| 
       115 
     | 
    
         
            -
                  elsif key == 'language'
         
     | 
| 
       116 
     | 
    
         
            -
                    value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9]/, '') }
         
     | 
| 
       117 
     | 
    
         
            -
                  elsif key == 'taglines'
         
     | 
| 
       118 
     | 
    
         
            -
                    # Backwards compatibility
         
     | 
| 
       119 
     | 
    
         
            -
                    info_hash['tagline'] = value
         
     | 
| 
       120 
     | 
    
         
            -
                  elsif key == 'motion picture rating (mpaa)'
         
     | 
| 
       121 
     | 
    
         
            -
                    value = value.gsub(/See all certifications/, '').strip
         
     | 
| 
       122 
     | 
    
         
            -
                    # Backwards compatibility FIXME do with a map
         
     | 
| 
       123 
     | 
    
         
            -
                    info_hash['mpaa'] = value
         
     | 
| 
       124 
133 
     | 
    
         
             
                  end
         
     | 
| 
       125 
     | 
    
         
            -
                   
     | 
| 
       126 
     | 
    
         
            -
                end
         
     | 
| 
       127 
     | 
    
         
            -
              
         
     | 
| 
       128 
     | 
    
         
            -
                if not found_info_divs
         
     | 
| 
       129 
     | 
    
         
            -
                  #If we don't find any info divs assume parsing failed
         
     | 
| 
       130 
     | 
    
         
            -
                  raise "No info divs found for imdb id #{imdb_id}"
         
     | 
| 
       131 
     | 
    
         
            -
                end
         
     | 
| 
       132 
     | 
    
         
            -
              
         
     | 
| 
       133 
     | 
    
         
            -
                self.scrap_images(doc, info_hash)
         
     | 
| 
       134 
     | 
    
         
            -
              
         
     | 
| 
       135 
     | 
    
         
            -
                #scrap episodes if tv series
         
     | 
| 
       136 
     | 
    
         
            -
                if info_hash.has_key?('season')
         
     | 
| 
       137 
     | 
    
         
            -
                  self.scrap_episodes(info_hash)
         
     | 
| 
      
 134 
     | 
    
         
            +
                  return value
         
     | 
| 
       138 
135 
     | 
    
         
             
                end
         
     | 
| 
       139 
     | 
    
         
            -
             
     | 
| 
       140 
     | 
    
         
            -
                 
     | 
| 
       141 
     | 
    
         
            -
             
     | 
| 
       142 
     | 
    
         
            -
             
     | 
| 
       143 
     | 
    
         
            -
             
     | 
| 
       144 
     | 
    
         
            -
             
     | 
| 
       145 
     | 
    
         
            -
             
     | 
| 
       146 
     | 
    
         
            -
             
     | 
| 
       147 
     | 
    
         
            -
             
     | 
| 
       148 
     | 
    
         
            -
             
     | 
| 
       149 
     | 
    
         
            -
             
     | 
| 
       150 
     | 
    
         
            -
                     
     | 
| 
       151 
     | 
    
         
            -
             
     | 
| 
       152 
     | 
    
         
            -
                    # Small thumbnail image, gotten by hacking medium url
         
     | 
| 
       153 
     | 
    
         
            -
                    info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
         
     | 
| 
       154 
     | 
    
         
            -
                  
         
     | 
| 
       155 
     | 
    
         
            -
                    #Try to scrap a larger version of the image url
         
     | 
| 
       156 
     | 
    
         
            -
                    large_img_page = doc.xpath("//td[@id = 'img_primary']/a").first['href']
         
     | 
| 
       157 
     | 
    
         
            -
                    large_img_doc = self.get_media_page(large_img_page) 
         
     | 
| 
       158 
     | 
    
         
            -
                    large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
         
     | 
| 
       159 
     | 
    
         
            -
                    info_hash['large_image'] = large_img_url
         
     | 
| 
      
 136 
     | 
    
         
            +
             
     | 
| 
      
 137 
     | 
    
         
            +
                def movie_properties(doc)
         
     | 
| 
      
 138 
     | 
    
         
            +
                  doc.css("div h4").each do |h4|
         
     | 
| 
      
 139 
     | 
    
         
            +
                    div = h4.parent
         
     | 
| 
      
 140 
     | 
    
         
            +
                    raw_key = h4.inner_text
         
     | 
| 
      
 141 
     | 
    
         
            +
                    key = raw_key.sub(':', '').strip.downcase
         
     | 
| 
      
 142 
     | 
    
         
            +
                    value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
         
     | 
| 
      
 143 
     | 
    
         
            +
                    value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)$/, '').strip
         
     | 
| 
      
 144 
     | 
    
         
            +
             
     | 
| 
      
 145 
     | 
    
         
            +
                    symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
         
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
      
 147 
     | 
    
         
            +
                    yield symbol_key, value
         
     | 
| 
       160 
148 
     | 
    
         
             
                  end
         
     | 
| 
       161 
149 
     | 
    
         
             
                end
         
     | 
| 
       162 
     | 
    
         
            -
               end
         
     | 
| 
       163 
150 
     | 
    
         | 
| 
       164 
     | 
    
         
            -
             
     | 
| 
      
 151 
     | 
    
         
            +
                def scrap_images(doc, info_hash)
         
     | 
| 
      
 152 
     | 
    
         
            +
                  #scrap poster image urls
         
     | 
| 
      
 153 
     | 
    
         
            +
                  thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
         
     | 
| 
      
 154 
     | 
    
         
            +
                  return if thumbnail_url.nil? || thumbnail_url =~ /\/nopicture\//
         
     | 
| 
      
 155 
     | 
    
         
            +
             
     | 
| 
      
 156 
     | 
    
         
            +
                  info_hash['medium_image'] = thumbnail_url
         
     | 
| 
      
 157 
     | 
    
         
            +
                  # Small thumbnail image, gotten by hacking medium url
         
     | 
| 
      
 158 
     | 
    
         
            +
                  info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
         
     | 
| 
      
 159 
     | 
    
         
            +
             
     | 
| 
      
 160 
     | 
    
         
            +
                  #Try to scrap a larger version of the image url
         
     | 
| 
      
 161 
     | 
    
         
            +
                  large_img_page_link = doc.at_css("td[id=img_primary] a").try(:[], 'href')
         
     | 
| 
      
 162 
     | 
    
         
            +
                  return unless large_img_page_link
         
     | 
| 
      
 163 
     | 
    
         
            +
                  large_img_doc = get_media_page(large_img_page_link) 
         
     | 
| 
      
 164 
     | 
    
         
            +
                  large_img_url = large_img_doc.at_css("img[id=primary-img]").try(:[], 'src')
         
     | 
| 
      
 165 
     | 
    
         
            +
                  info_hash['large_image'] = large_img_url
         
     | 
| 
      
 166 
     | 
    
         
            +
                end
         
     | 
| 
      
 167 
     | 
    
         
            +
             
     | 
| 
      
 168 
     | 
    
         
            +
                # Scrapes the episode listing for a show and stores it on info_hash.
                #
                # info_hash - Hash describing the show; info_hash[:imdb_id] is passed to
                #             get_episodes_page to fetch the listing document.
                #
                # Every ".filter-all" element whose <h3> reads
                # "Season N, Episode M: Title" becomes a Hash with the keys "series",
                # "episode", "title", 'date' (a Date, or nil when the text cannot be
                # parsed) and, when the date text is found in the element, 'plot'
                # (the text that follows the date).
                #
                # Returns the Array of episode hashes, also stored in info_hash['episodes'].
                def scrap_episodes(info_hash)
                  episodes = []
                  doc = self.get_episodes_page(info_hash[:imdb_id])

                  doc.css(".filter-all").each do |e_div|
                    # Skip entries without a heading instead of raising on nil.
                    heading = e_div.at_css('h3')
                    next unless heading && heading.inner_text =~ /Season (\d+), Episode (\d+):/
                    episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}

                    date_node = e_div.at_css('strong')
                    if date_node
                      raw_date = date_node.inner_text.strip
                      # Only an unparseable date is expected here; other errors should surface.
                      episode['date'] = begin
                        Date.parse(raw_date)
                      rescue ArgumentError
                        nil
                      end

                      # Locate the date literally: it is page text, not a regex, so
                      # characters such as "?" or "(" must not be interpreted.
                      full_text = e_div.inner_text
                      date_at = full_text.index(raw_date)
                      episode['plot'] = full_text[(date_at + raw_date.length)..-1].strip if date_at
                    else
                      episode['date'] = nil
                    end

                    episodes << episode
                  end
                  info_hash['episodes'] = episodes
                end
         
     | 
| 
       185 
186 
     | 
    
         | 
| 
       186 
     | 
    
         
            -
             
     | 
| 
       187 
     | 
    
         
            -
             
     | 
| 
       188 
     | 
    
         
            -
             
     | 
| 
      
 187 
     | 
    
         
            +
                  # Fetches the IMDB search results for +name+ and parses them.
                  #
                  # name - search text; it is URI-escaped and appended to IMDB_SEARCH_URL.
                  #
                  # Returns the document produced by Nokogiri::HTML for the response.
                  def get_search_page(name)
                    # Block form: the handle returned by open is closed as soon as the
                    # document has been parsed instead of lingering until GC.
                    # NOTE(review): presumably open-uri's Kernel#open; URI.escape and
                    # URL support in Kernel#open were removed in Ruby 3.0 - confirm the
                    # target Ruby version before upgrading.
                    open(IMDB_SEARCH_URL + URI.escape(name)) { |page| Nokogiri::HTML(page) }
                  end
         
     | 
| 
       189 
190 
     | 
    
         | 
| 
       190 
     | 
    
         
            -
             
     | 
| 
       191 
     | 
    
         
            -
             
     | 
| 
       192 
     | 
    
         
            -
             
     | 
| 
      
 191 
     | 
    
         
            +
                  # Fetches and parses the page for a single title.
                  #
                  # imdb_id - identifier appended to IMDB_MOVIE_URL.
                  #
                  # Returns the document produced by Nokogiri::HTML for the response.
                  def get_movie_page(imdb_id)
                    # Block form so the handle returned by open is closed after parsing.
                    open(IMDB_MOVIE_URL + imdb_id) { |page| Nokogiri::HTML(page) }
                  end
         
     | 
| 
       193 
194 
     | 
    
         | 
| 
       194 
     | 
    
         
            -
             
     | 
| 
       195 
     | 
    
         
            -
             
     | 
| 
       196 
     | 
    
         
            -
             
     | 
| 
      
 195 
     | 
    
         
            +
                  # Fetches and parses the episode listing page for a title.
                  #
                  # imdb_id - identifier appended to IMDB_MOVIE_URL, followed by '/episodes'.
                  #
                  # Returns the document produced by Nokogiri::HTML for the response.
                  def get_episodes_page(imdb_id)
                    # Block form so the handle returned by open is closed after parsing.
                    open(IMDB_MOVIE_URL + imdb_id + '/episodes') { |page| Nokogiri::HTML(page) }
                  end
         
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
      
 199 
     | 
    
         
            +
                  # Fetches and parses a media page.
                  #
                  # url_fragment - path appended to IMDB_BASE_URL.
                  #
                  # Returns the document produced by Nokogiri::HTML for the response.
                  def get_media_page(url_fragment)
                    # Block form so the handle returned by open is closed after parsing.
                    open(IMDB_BASE_URL + url_fragment) { |page| Nokogiri::HTML(page) }
                  end
         
     | 
| 
       197 
202 
     | 
    
         | 
| 
       198 
     | 
    
         
            -
             
     | 
| 
       199 
     | 
    
         
            -
             
     | 
| 
      
 203 
     | 
    
         
            +
                  # Reads the title and release year out of the page's
                  # <meta name="title"> tag.
                  #
                  # doc - document responding to at_css.
                  #
                  # Returns a two element Array [title, year]; both entries are nil when
                  # the tag is missing or its content does not match the expected shape.
                  def get_title_and_year_from_meta(doc)
                    meta_content = doc.at_css("meta[name='title']").try(:[], 'content')
                    # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
                    title_pattern = /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
                    parsed = meta_content && title_pattern.match(meta_content)
                    return nil, nil unless parsed

                    [self.clean_title(parsed[1]), parsed[2].to_i]
                  end
         
     | 
| 
      
 212 
     | 
    
         
            +
             
     | 
| 
      
 213 
     | 
    
         
            +
                  # TV show names tend to arrive wrapped in double quotes; drop that
                  # wrapping pair when present and trim surrounding whitespace.
                  #
                  # movie_title - raw title String.
                  #
                  # Returns the cleaned title String.
                  def clean_title(movie_title)
                    quoted = /^"(.*)"$/.match(movie_title)
                    bare_title = quoted ? quoted[1] : movie_title
                    bare_title.strip
                  end
         
     | 
| 
       200 
218 
     | 
    
         | 
| 
       201 
     | 
    
         
            -
                   
     | 
| 
       202 
     | 
    
         
            -
                   
     | 
| 
       203 
     | 
    
         
            -
             
     | 
| 
       204 
     | 
    
         
            -
             
     | 
| 
       205 
     | 
    
         
            -
             
     | 
| 
       206 
     | 
    
         
            -
                  
         
     | 
| 
       207 
     | 
    
         
            -
                     
     | 
| 
      
 219 
     | 
    
         
            +
                  # Workaround for a Ruby 1.9 encoding issue: the text is re-encoded as
                  # UTF-8 before STRIP_WHITESPACE matches are removed.
                  #
                  # s - String to clean.
                  #
                  # Returns the String with every STRIP_WHITESPACE match deleted and
                  # leading/trailing whitespace trimmed.
                  def strip_whitespace(s)
                    utf8_text = s.encode('UTF-8')
                    without_matches = utf8_text.gsub(STRIP_WHITESPACE, '')
                    without_matches.strip
                  end
         
     | 
| 
      
 223 
     | 
    
         
            +
                
         
     | 
| 
      
 224 
     | 
    
         
            +
                  # Classifies a search result cell as a TV show or a movie.
                  #
                  # td - object whose content is checked for a "(TV series)" or "(TV)"
                  #      marker.
                  #
                  # Returns :tv_show when the marker is present, otherwise :movie.
                  def video_type(td)
                    if td.content =~ /\((TV series|TV)\)/
                      :tv_show
                    else
                      :movie
                    end
                  end
         
     | 
| 
      
 228 
     | 
    
         
            +
                
         
     | 
| 
      
 229 
     | 
    
         
            +
                  # Determines the video type from the page's og:type meta tag.
                  #
                  # doc - document responding to at_css.
                  #
                  # Returns :tv_show when the tag's content is exactly 'tv_show'; :movie
                  # for any other content or when the tag is absent.
                  def video_type_from_meta(doc)
                    og_type = doc.at_css("meta[property='og:type']").try(:[], 'content')
                    return :tv_show if og_type == 'tv_show'

                    :movie
                  end
         
     | 
| 
       209 
     | 
    
         
            -
                  return movie_title, movie_year
         
     | 
| 
       210 
     | 
    
         
            -
                end  
         
     | 
| 
       211 
233 
     | 
    
         | 
| 
       212 
     | 
    
         
            -
                # Remove surrounding double quotes that seems to appear on tv show name
         
     | 
| 
       213 
     | 
    
         
            -
                def self.clean_title(movie_title)
         
     | 
| 
       214 
     | 
    
         
            -
                  movie_title = $1 if movie_title =~ /^"(.*)"$/
         
     | 
| 
       215 
     | 
    
         
            -
                  return movie_title.strip
         
     | 
| 
       216 
     | 
    
         
            -
                end  
         
     | 
| 
       217 
     | 
    
         
            -
              
         
     | 
| 
       218 
     | 
    
         
            -
                # Hackyness to get around ruby 1.9 encoding issue
         
     | 
| 
       219 
     | 
    
         
            -
                def self.strip_whitespace(s)
         
     | 
| 
       220 
     | 
    
         
            -
                  s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
         
     | 
| 
       221 
     | 
    
         
            -
                end  
         
     | 
| 
       222 
     | 
    
         
            -
              
         
     | 
| 
       223 
     | 
    
         
            -
                def self.video_type(td)
         
     | 
| 
       224 
     | 
    
         
            -
                  return :tv_show if td.content =~ /\((TV series|TV)\)/
         
     | 
| 
       225 
     | 
    
         
            -
                  return :movie
         
     | 
| 
       226 
     | 
    
         
            -
                end 
         
     | 
| 
       227 
     | 
    
         
            -
              
         
     | 
| 
       228 
     | 
    
         
            -
                def self.video_type_from_meta(doc)
         
     | 
| 
       229 
     | 
    
         
            -
                  meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
         
     | 
| 
       230 
     | 
    
         
            -
                  return :movie unless meta_type_tag.first
         
     | 
| 
       231 
     | 
    
         
            -
                  type_text = meta_type_tag.first['content']
         
     | 
| 
       232 
     | 
    
         
            -
                  case type_text
         
     | 
| 
       233 
     | 
    
         
            -
                    when 'tv_show' then return :tv_show
         
     | 
| 
       234 
     | 
    
         
            -
                    else return :movie
         
     | 
| 
       235 
     | 
    
         
            -
                  end   
         
     | 
| 
       236 
234 
     | 
    
         
             
                end
         
     | 
| 
       237 
235 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version 
     | 
|
| 
       4 
4 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       5 
5 
     | 
    
         
             
              segments: 
         
     | 
| 
       6 
6 
     | 
    
         
             
              - 0
         
     | 
| 
       7 
     | 
    
         
            -
              -  
     | 
| 
       8 
     | 
    
         
            -
              -  
     | 
| 
       9 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 7 
     | 
    
         
            +
              - 2
         
     | 
| 
      
 8 
     | 
    
         
            +
              - 0
         
     | 
| 
      
 9 
     | 
    
         
            +
              version: 0.2.0
         
     | 
| 
       10 
10 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       11 
11 
     | 
    
         
             
            authors: 
         
     | 
| 
       12 
12 
     | 
    
         
             
            - Sam Cavenagh
         
     | 
| 
         @@ -14,7 +14,7 @@ autorequire: 
     | 
|
| 
       14 
14 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       15 
15 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
     | 
    
         
            -
            date: 2011- 
     | 
| 
      
 17 
     | 
    
         
            +
            date: 2011-03-06 00:00:00 +11:00
         
     | 
| 
       18 
18 
     | 
    
         
             
            default_executable: 
         
     | 
| 
       19 
19 
     | 
    
         
             
            dependencies: 
         
     | 
| 
       20 
20 
     | 
    
         
             
            - !ruby/object:Gem::Dependency 
         
     |