yayimdbs 0.1.10 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/README.md +2 -2
  2. data/lib/yay_imdbs.rb +180 -182
  3. metadata +4 -4
data/README.md CHANGED
@@ -4,7 +4,7 @@ Overview
4
4
  --------
5
5
  Yet Another Ying IMDB Scraper
6
6
 
7
- This is a simple imdb scraper, that i created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into it's own gem so i can share it across projects.
7
+ This is a simple imdb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
8
8
 
9
9
  Features
10
10
  --------
@@ -49,4 +49,4 @@ MIT
49
49
 
50
50
  Contact
51
51
  -------
52
- Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
52
+ Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
data/lib/yay_imdbs.rb CHANGED
@@ -18,220 +18,218 @@ class YayImdbs
18
18
 
19
19
  STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
20
20
 
21
- def self.search_for_imdb_id(name, year=nil, type=nil)
22
- search_results = self.search_imdb(name)
23
- return nil if search_results.empty?
24
-
25
- search_results.each do |result|
26
- # Ensure result is the correct video type
27
- next if type && (result[:video_type] != type)
28
-
29
- # If no year provided just return first result
30
- return result[:imdb_id] if !year || result[:year] == year
31
- end
32
- return nil
33
- end
34
-
35
- def self.search_imdb(search_term)
36
- search_results = []
37
-
38
- doc = self.get_search_page(search_term)
39
- # If the search is an exact match imdb will redirect to the movie page not search results page
40
- # we uses the the title meta element to determine if we got an exact match
41
- movie_title, movie_year = get_title_and_year_from_meta(doc)
42
- if movie_title
43
- canonical_link = doc.xpath("//link[@rel='canonical']")
44
- if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
45
- return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
46
- else
47
- raise "Unable to extract imdb id from exact search result"
21
+ DATE_PROPERTIES = [:release_date]
22
+ LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
23
+ INT_LIST_PROPERTIES = [:year, :season]
24
+ PROPERTY_ALIAS = {:genres => :genre,
25
+ :taglines => :tagline,
26
+ :year => :years,
27
+ :season => :seasons,
28
+ :language => :languages,
29
+ :motion_picture_rating_mpaa => :mpaa}
30
+
31
+ class << self
32
+
33
+ def search_for_imdb_id(name, year=nil, type=nil)
34
+ search_results = self.search_imdb(name)
35
+
36
+ search_results.each do |result|
37
+ # Ensure result is the correct video type
38
+ next if type && (result[:video_type] != type)
39
+
40
+ # If no year provided just return first result
41
+ return result[:imdb_id] if year.nil? || result[:year] == year
48
42
  end
43
+ return nil
49
44
  end
50
-
51
- doc.xpath("//td").each do |td|
52
- td.xpath(".//a").each do |link|
53
- href = link['href']
54
- current_name = link.content
55
-
56
- # Ignore links with no text (e.g. image links)
57
- next unless current_name.present?
58
- current_name = self.clean_title(current_name)
59
-
60
- if href =~ /^\/title\/tt(\d+)/
45
+
46
+ def search_imdb(search_term)
47
+ search_results = []
48
+
49
+ doc = self.get_search_page(search_term)
50
+
51
+ # If the search is an exact match imdb will redirect to the movie page not search results page
52
+ # we use the title meta element to determine if we got an exact match
53
+ movie_title, movie_year = get_title_and_year_from_meta(doc)
54
+ if movie_title
55
+ canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
56
+ if canonical_link && canonical_link =~ /tt(\d+)\//
57
+ return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
58
+ else
59
+ raise "Unable to extract imdb id from exact search result"
60
+ end
61
+ end
62
+
63
+ doc.css("td").each do |td|
64
+ td.css("a").each do |link|
65
+ href = link['href']
66
+ current_name = link.content
67
+
68
+ # Ignore links with no text (e.g. image links) or links that don't link to movie pages
69
+ next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
61
70
  imdb_id = $1
62
71
  current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
63
- search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type => self.video_type(td)}
72
+ search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
64
73
  end
65
74
  end
66
- end
67
-
68
- return search_results
69
- end
70
-
71
- def self.scrap_movie_info(imdb_id)
72
- info_hash = {:imdb_id => imdb_id}.with_indifferent_access
73
-
74
- doc = self.get_movie_page(imdb_id)
75
- info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
76
- if info_hash['title'].nil?
77
- #If we cant get title and year something is wrong
78
- raise "Unable to find title or year for imdb id #{imdb_id}"
79
- end
80
- info_hash['video_type'] = self.video_type_from_meta(doc)
81
75
 
82
- info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
83
-
84
- found_info_divs = false
85
- doc.xpath("//div/h4").each do |h4|
86
- div = h4.parent
87
- found_info_divs = true
88
- raw_key = h4.inner_text
89
- key = raw_key.sub(':', '').strip.downcase
90
- value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
91
- value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)$/, '').strip
76
+ return search_results
77
+ end
78
+
79
+ def scrap_movie_info(imdb_id)
80
+ info_hash = {:imdb_id => imdb_id}.with_indifferent_access
81
+
82
+ doc = self.get_movie_page(imdb_id)
83
+ title, year = get_title_and_year_from_meta(doc)
84
+ info_hash[:title], info_hash[:year] = title, year
85
+ if info_hash['title'].nil?
86
+ # If we can't get the title and year, something is wrong
87
+ raise "Unable to find title or year for imdb id #{imdb_id}"
88
+ end
89
+ info_hash[:video_type] = self.video_type_from_meta(doc)
92
90
 
93
- if key == 'release date'
94
- begin
95
- value = Date.strptime(value, '%d %B %Y')
96
- rescue
97
- p "Invalid date '#{value}' for imdb id: #{imdb_id}"
98
- value = nil
99
- end
100
- elsif key == 'runtime'
91
+ info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
92
+ info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil
93
+
94
+ found_info_divs = false
95
+ movie_properties(doc) do |key, value|
96
+ found_info_divs = true
97
+ info_hash["raw_#{key}"] = value
98
+ info_hash[key] = clean_movie_property(key, value)
99
+ info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
100
+ end
101
+
102
+ if not found_info_divs
103
+ #If we don't find any info divs assume parsing failed
104
+ raise "No info divs found for imdb id #{imdb_id}"
105
+ end
106
+
107
+ # Hack: tv shows can have a year property, which is a list, fixing ...
108
+ info_hash[:year] = year
109
+
110
+ self.scrap_images(doc, info_hash)
111
+
112
+ #scrap episodes if tv series
113
+ if info_hash.has_key?('season')
114
+ self.scrap_episodes(info_hash)
115
+ end
116
+
117
+ return info_hash
118
+ end
119
+
120
+ def clean_movie_property(key, value)
121
+ if DATE_PROPERTIES.include?(key)
122
+ value = Date.strptime(value, '%d %B %Y') rescue nil
123
+ elsif key == :runtime
101
124
  if value =~ /(\d+)\smin/
102
125
  value = $1.to_i
103
126
  else
104
- p "Unexpected runtime format #{value} for movie #{imdb_id}"
127
+ value = nil
105
128
  end
106
- elsif key == 'genres'
129
+ elsif LIST_PROPERTIES.include?(key)
107
130
  value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
108
- # Backwards compatibility hack
109
- info_hash[:genre] = value
110
- elsif key == 'year'
131
+ elsif INT_LIST_PROPERTIES.include?(key)
111
132
  value = value.split('|').collect { |l| l.strip.to_i }.reject { |y| y <= 0 }
112
- # TV shows can have multiple years
113
- info_hash[:years] = value
114
- value = value.sort.first
115
- elsif key == 'language'
116
- value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9]/, '') }
117
- elsif key == 'taglines'
118
- # Backwards compatibility
119
- info_hash['tagline'] = value
120
- elsif key == 'motion picture rating (mpaa)'
121
- value = value.gsub(/See all certifications/, '').strip
122
- # Backwards compatibility FIXME do with a map
123
- info_hash['mpaa'] = value
124
133
  end
125
- info_hash[key.downcase.gsub(/\s/, '_')] = value
126
- end
127
-
128
- if not found_info_divs
129
- #If we don't find any info divs assume parsing failed
130
- raise "No info divs found for imdb id #{imdb_id}"
131
- end
132
-
133
- self.scrap_images(doc, info_hash)
134
-
135
- #scrap episodes if tv series
136
- if info_hash.has_key?('season')
137
- self.scrap_episodes(info_hash)
134
+ return value
138
135
  end
139
-
140
- return info_hash
141
- end
142
-
143
- private
144
- def self.scrap_images(doc, info_hash)
145
- #scrap poster image urls
146
- thumb = doc.xpath("//td[@id = 'img_primary']/a/img")
147
- if thumb.first
148
- thumbnail_url = thumb.first['src']
149
- if not thumbnail_url =~ /\/nopicture\//
150
- info_hash['medium_image'] = thumbnail_url
151
-
152
- # Small thumbnail image, gotten by hacking medium url
153
- info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
154
-
155
- #Try to scrap a larger version of the image url
156
- large_img_page = doc.xpath("//td[@id = 'img_primary']/a").first['href']
157
- large_img_doc = self.get_media_page(large_img_page)
158
- large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
159
- info_hash['large_image'] = large_img_url
136
+
137
+ def movie_properties(doc)
138
+ doc.css("div h4").each do |h4|
139
+ div = h4.parent
140
+ raw_key = h4.inner_text
141
+ key = raw_key.sub(':', '').strip.downcase
142
+ value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
143
+ value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)$/, '').strip
144
+
145
+ symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
146
+
147
+ yield symbol_key, value
160
148
  end
161
149
  end
162
- end
163
150
 
164
- def self.scrap_episodes(info_hash)
151
+ def scrap_images(doc, info_hash)
152
+ #scrap poster image urls
153
+ thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
154
+ return if thumbnail_url.nil? || thumbnail_url =~ /\/nopicture\//
155
+
156
+ info_hash['medium_image'] = thumbnail_url
157
+ # Small thumbnail image, gotten by hacking medium url
158
+ info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
159
+
160
+ #Try to scrap a larger version of the image url
161
+ large_img_page_link = doc.at_css("td[id=img_primary] a").try(:[], 'href')
162
+ return unless large_img_page_link
163
+ large_img_doc = get_media_page(large_img_page_link)
164
+ large_img_url = large_img_doc.at_css("img[id=primary-img]").try(:[], 'src')
165
+ info_hash['large_image'] = large_img_url
166
+ end
167
+
168
+ def scrap_episodes(info_hash)
165
169
  episodes = []
166
170
  doc = self.get_episodes_page(info_hash[:imdb_id])
167
- episode_divs = doc.css(".filter-all")
168
- episode_divs.each do |e_div|
169
- if e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
171
+
172
+ doc.css(".filter-all").each do |e_div|
173
+ next unless e_div.at_css('h3').inner_text =~ /Season (\d+), Episode (\d+):/
170
174
  episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
171
- raw_date = e_div.xpath('.//span/strong').inner_text.strip
172
- episode['date'] = Date.parse(raw_date)
173
- if e_div.inner_text =~ /#{raw_date}/
174
- episode['plot'] = $'.strip
175
- end
176
- episodes << episode
175
+
176
+ raw_date = e_div.at_css('strong').inner_text.strip
177
+ episode['date'] = Date.parse(raw_date) rescue nil
178
+ if e_div.inner_text =~ /#{raw_date}/
179
+ episode['plot'] = $'.strip
177
180
  end
181
+
182
+ episodes << episode
178
183
  end
179
184
  info_hash['episodes'] = episodes
180
- end
181
-
182
- def self.get_search_page(name)
183
- Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
184
185
  end
185
186
 
186
- def self.get_movie_page(imdb_id)
187
- Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
188
- end
187
+ def get_search_page(name)
188
+ Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
189
+ end
189
190
 
190
- def self.get_episodes_page(imdb_id)
191
- Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
192
- end
191
+ def get_movie_page(imdb_id)
192
+ Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
193
+ end
193
194
 
194
- def self.get_media_page(url_fragment)
195
- Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
196
- end
195
+ def get_episodes_page(imdb_id)
196
+ Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
197
+ end
198
+
199
+ def get_media_page(url_fragment)
200
+ Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
201
+ end
197
202
 
198
- def self.get_title_and_year_from_meta(doc)
199
- return nil, nil unless doc.xpath("//meta[@name='title']").first
203
+ def get_title_and_year_from_meta(doc)
204
+ title_text = doc.at_css("meta[name='title']").try(:[], 'content')
205
+ # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
206
+ if title_text && title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
207
+ movie_title = self.clean_title($1)
208
+ movie_year = $2.to_i
209
+ end
210
+ return movie_title, movie_year
211
+ end
212
+
213
+ # Remove surrounding double quotes that seem to appear on TV show names
214
+ def clean_title(movie_title)
215
+ movie_title = $1 if movie_title =~ /^"(.*)"$/
216
+ return movie_title.strip
217
+ end
200
218
 
201
- title_text = doc.xpath("//meta[@name='title']").first['content']
202
- # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
203
- if title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
204
- movie_title = $1
205
- movie_year = $2.to_i
206
-
207
- movie_title = self.clean_title(movie_title)
219
+ # Hackiness to get around a Ruby 1.9 encoding issue
220
+ def strip_whitespace(s)
221
+ s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
222
+ end
223
+
224
+ def video_type(td)
225
+ return :tv_show if td.content =~ /\((TV series|TV)\)/
226
+ return :movie
227
+ end
228
+
229
+ def video_type_from_meta(doc)
230
+ type_text = doc.at_css("meta[property='og:type']").try(:[], 'content')
231
+ type_text == 'tv_show' ? :tv_show : :movie
208
232
  end
209
- return movie_title, movie_year
210
- end
211
233
 
212
- # Remove surrounding double quotes that seems to appear on tv show name
213
- def self.clean_title(movie_title)
214
- movie_title = $1 if movie_title =~ /^"(.*)"$/
215
- return movie_title.strip
216
- end
217
-
218
- # Hackyness to get around ruby 1.9 encoding issue
219
- def self.strip_whitespace(s)
220
- s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
221
- end
222
-
223
- def self.video_type(td)
224
- return :tv_show if td.content =~ /\((TV series|TV)\)/
225
- return :movie
226
- end
227
-
228
- def self.video_type_from_meta(doc)
229
- meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
230
- return :movie unless meta_type_tag.first
231
- type_text = meta_type_tag.first['content']
232
- case type_text
233
- when 'tv_show' then return :tv_show
234
- else return :movie
235
- end
236
234
  end
237
235
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 10
9
- version: 0.1.10
7
+ - 2
8
+ - 0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Sam Cavenagh
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-12 00:00:00 +11:00
17
+ date: 2011-03-06 00:00:00 +11:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency