yayimdbs 0.1.10 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/README.md +2 -2
  2. data/lib/yay_imdbs.rb +180 -182
  3. metadata +4 -4
data/README.md CHANGED
@@ -4,7 +4,7 @@ Overview
4
4
  --------
5
5
  Yet Another Ying IMDB Scraper
6
6
 
7
- This is a simple imdb scraper, that i created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into it's own gem so i can share it across projects.
7
+ This is a simple imdb scraper, that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into it's own gem so I can share it across projects.
8
8
 
9
9
  Features
10
10
  --------
@@ -49,4 +49,4 @@ MIT
49
49
 
50
50
  Contact
51
51
  -------
52
- Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
52
+ Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
data/lib/yay_imdbs.rb CHANGED
@@ -18,220 +18,218 @@ class YayImdbs
18
18
 
19
19
  STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
20
20
 
21
- def self.search_for_imdb_id(name, year=nil, type=nil)
22
- search_results = self.search_imdb(name)
23
- return nil if search_results.empty?
24
-
25
- search_results.each do |result|
26
- # Ensure result is the correct video type
27
- next if type && (result[:video_type] != type)
28
-
29
- # If no year provided just return first result
30
- return result[:imdb_id] if !year || result[:year] == year
31
- end
32
- return nil
33
- end
34
-
35
- def self.search_imdb(search_term)
36
- search_results = []
37
-
38
- doc = self.get_search_page(search_term)
39
- # If the search is an exact match imdb will redirect to the movie page not search results page
40
- # we uses the the title meta element to determine if we got an exact match
41
- movie_title, movie_year = get_title_and_year_from_meta(doc)
42
- if movie_title
43
- canonical_link = doc.xpath("//link[@rel='canonical']")
44
- if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
45
- return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
46
- else
47
- raise "Unable to extract imdb id from exact search result"
21
+ DATE_PROPERTIES = [:release_date]
22
+ LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
23
+ INT_LIST_PROPERTIES = [:year, :season]
24
+ PROPERTY_ALIAS = {:genres => :genre,
25
+ :taglines => :tagline,
26
+ :year => :years,
27
+ :season => :seasons,
28
+ :language => :languages,
29
+ :motion_picture_rating_mpaa => :mpaa}
30
+
31
+ class << self
32
+
33
+ def search_for_imdb_id(name, year=nil, type=nil)
34
+ search_results = self.search_imdb(name)
35
+
36
+ search_results.each do |result|
37
+ # Ensure result is the correct video type
38
+ next if type && (result[:video_type] != type)
39
+
40
+ # If no year provided just return first result
41
+ return result[:imdb_id] if year.nil? || result[:year] == year
48
42
  end
43
+ return nil
49
44
  end
50
-
51
- doc.xpath("//td").each do |td|
52
- td.xpath(".//a").each do |link|
53
- href = link['href']
54
- current_name = link.content
55
-
56
- # Ignore links with no text (e.g. image links)
57
- next unless current_name.present?
58
- current_name = self.clean_title(current_name)
59
-
60
- if href =~ /^\/title\/tt(\d+)/
45
+
46
+ def search_imdb(search_term)
47
+ search_results = []
48
+
49
+ doc = self.get_search_page(search_term)
50
+
51
+ # If the search is an exact match imdb will redirect to the movie page not search results page
52
+ # we uses the title meta element to determine if we got an exact match
53
+ movie_title, movie_year = get_title_and_year_from_meta(doc)
54
+ if movie_title
55
+ canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
56
+ if canonical_link && canonical_link =~ /tt(\d+)\//
57
+ return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
58
+ else
59
+ raise "Unable to extract imdb id from exact search result"
60
+ end
61
+ end
62
+
63
+ doc.css("td").each do |td|
64
+ td.css("a").each do |link|
65
+ href = link['href']
66
+ current_name = link.content
67
+
68
+ # Ignore links with no text (e.g. image links) or links that don't link to movie pages
69
+ next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
61
70
  imdb_id = $1
62
71
  current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
63
- search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type => self.video_type(td)}
72
+ search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
64
73
  end
65
74
  end
66
- end
67
-
68
- return search_results
69
- end
70
-
71
- def self.scrap_movie_info(imdb_id)
72
- info_hash = {:imdb_id => imdb_id}.with_indifferent_access
73
-
74
- doc = self.get_movie_page(imdb_id)
75
- info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
76
- if info_hash['title'].nil?
77
- #If we cant get title and year something is wrong
78
- raise "Unable to find title or year for imdb id #{imdb_id}"
79
- end
80
- info_hash['video_type'] = self.video_type_from_meta(doc)
81
75
 
82
- info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
83
-
84
- found_info_divs = false
85
- doc.xpath("//div/h4").each do |h4|
86
- div = h4.parent
87
- found_info_divs = true
88
- raw_key = h4.inner_text
89
- key = raw_key.sub(':', '').strip.downcase
90
- value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
91
- value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)$/, '').strip
76
+ return search_results
77
+ end
78
+
79
+ def scrap_movie_info(imdb_id)
80
+ info_hash = {:imdb_id => imdb_id}.with_indifferent_access
81
+
82
+ doc = self.get_movie_page(imdb_id)
83
+ title, year = get_title_and_year_from_meta(doc)
84
+ info_hash[:title], info_hash[:year] = title, year
85
+ if info_hash['title'].nil?
86
+ #If we cant get title and year something is wrong
87
+ raise "Unable to find title or year for imdb id #{imdb_id}"
88
+ end
89
+ info_hash[:video_type] = self.video_type_from_meta(doc)
92
90
 
93
- if key == 'release date'
94
- begin
95
- value = Date.strptime(value, '%d %B %Y')
96
- rescue
97
- p "Invalid date '#{value}' for imdb id: #{imdb_id}"
98
- value = nil
99
- end
100
- elsif key == 'runtime'
91
+ info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
92
+ info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil
93
+
94
+ found_info_divs = false
95
+ movie_properties(doc) do |key, value|
96
+ found_info_divs = true
97
+ info_hash["raw_#{key}"] = value
98
+ info_hash[key] = clean_movie_property(key, value)
99
+ info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
100
+ end
101
+
102
+ if not found_info_divs
103
+ #If we don't find any info divs assume parsing failed
104
+ raise "No info divs found for imdb id #{imdb_id}"
105
+ end
106
+
107
+ # Hack: tv shows can have a year property, which is a list, fixing ...
108
+ info_hash[:year] = year
109
+
110
+ self.scrap_images(doc, info_hash)
111
+
112
+ #scrap episodes if tv series
113
+ if info_hash.has_key?('season')
114
+ self.scrap_episodes(info_hash)
115
+ end
116
+
117
+ return info_hash
118
+ end
119
+
120
+ def clean_movie_property(key, value)
121
+ if DATE_PROPERTIES.include?(key)
122
+ value = Date.strptime(value, '%d %B %Y') rescue nil
123
+ elsif key == :runtime
101
124
  if value =~ /(\d+)\smin/
102
125
  value = $1.to_i
103
126
  else
104
- p "Unexpected runtime format #{value} for movie #{imdb_id}"
127
+ value = nil
105
128
  end
106
- elsif key == 'genres'
129
+ elsif LIST_PROPERTIES.include?(key)
107
130
  value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
108
- # Backwards compatibility hack
109
- info_hash[:genre] = value
110
- elsif key == 'year'
131
+ elsif INT_LIST_PROPERTIES.include?(key)
111
132
  value = value.split('|').collect { |l| l.strip.to_i }.reject { |y| y <= 0 }
112
- # TV shows can have multiple years
113
- info_hash[:years] = value
114
- value = value.sort.first
115
- elsif key == 'language'
116
- value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9]/, '') }
117
- elsif key == 'taglines'
118
- # Backwards compatibility
119
- info_hash['tagline'] = value
120
- elsif key == 'motion picture rating (mpaa)'
121
- value = value.gsub(/See all certifications/, '').strip
122
- # Backwards compatibility FIXME do with a map
123
- info_hash['mpaa'] = value
124
133
  end
125
- info_hash[key.downcase.gsub(/\s/, '_')] = value
126
- end
127
-
128
- if not found_info_divs
129
- #If we don't find any info divs assume parsing failed
130
- raise "No info divs found for imdb id #{imdb_id}"
131
- end
132
-
133
- self.scrap_images(doc, info_hash)
134
-
135
- #scrap episodes if tv series
136
- if info_hash.has_key?('season')
137
- self.scrap_episodes(info_hash)
134
+ return value
138
135
  end
139
-
140
- return info_hash
141
- end
142
-
143
- private
144
- def self.scrap_images(doc, info_hash)
145
- #scrap poster image urls
146
- thumb = doc.xpath("//td[@id = 'img_primary']/a/img")
147
- if thumb.first
148
- thumbnail_url = thumb.first['src']
149
- if not thumbnail_url =~ /\/nopicture\//
150
- info_hash['medium_image'] = thumbnail_url
151
-
152
- # Small thumbnail image, gotten by hacking medium url
153
- info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
154
-
155
- #Try to scrap a larger version of the image url
156
- large_img_page = doc.xpath("//td[@id = 'img_primary']/a").first['href']
157
- large_img_doc = self.get_media_page(large_img_page)
158
- large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
159
- info_hash['large_image'] = large_img_url
136
+
137
+ def movie_properties(doc)
138
+ doc.css("div h4").each do |h4|
139
+ div = h4.parent
140
+ raw_key = h4.inner_text
141
+ key = raw_key.sub(':', '').strip.downcase
142
+ value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
143
+ value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)$/, '').strip
144
+
145
+ symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
146
+
147
+ yield symbol_key, value
160
148
  end
161
149
  end
162
- end
163
150
 
164
- def self.scrap_episodes(info_hash)
151
+ def scrap_images(doc, info_hash)
152
+ #scrap poster image urls
153
+ thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
154
+ return if thumbnail_url.nil? || thumbnail_url =~ /\/nopicture\//
155
+
156
+ info_hash['medium_image'] = thumbnail_url
157
+ # Small thumbnail image, gotten by hacking medium url
158
+ info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
159
+
160
+ #Try to scrap a larger version of the image url
161
+ large_img_page_link = doc.at_css("td[id=img_primary] a").try(:[], 'href')
162
+ return unless large_img_page_link
163
+ large_img_doc = get_media_page(large_img_page_link)
164
+ large_img_url = large_img_doc.at_css("img[id=primary-img]").try(:[], 'src')
165
+ info_hash['large_image'] = large_img_url
166
+ end
167
+
168
+ def scrap_episodes(info_hash)
165
169
  episodes = []
166
170
  doc = self.get_episodes_page(info_hash[:imdb_id])
167
- episode_divs = doc.css(".filter-all")
168
- episode_divs.each do |e_div|
169
- if e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
171
+
172
+ doc.css(".filter-all").each do |e_div|
173
+ next unless e_div.at_css('h3').inner_text =~ /Season (\d+), Episode (\d+):/
170
174
  episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
171
- raw_date = e_div.xpath('.//span/strong').inner_text.strip
172
- episode['date'] = Date.parse(raw_date)
173
- if e_div.inner_text =~ /#{raw_date}/
174
- episode['plot'] = $'.strip
175
- end
176
- episodes << episode
175
+
176
+ raw_date = e_div.at_css('strong').inner_text.strip
177
+ episode['date'] = Date.parse(raw_date) rescue nil
178
+ if e_div.inner_text =~ /#{raw_date}/
179
+ episode['plot'] = $'.strip
177
180
  end
181
+
182
+ episodes << episode
178
183
  end
179
184
  info_hash['episodes'] = episodes
180
- end
181
-
182
- def self.get_search_page(name)
183
- Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
184
185
  end
185
186
 
186
- def self.get_movie_page(imdb_id)
187
- Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
188
- end
187
+ def get_search_page(name)
188
+ Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
189
+ end
189
190
 
190
- def self.get_episodes_page(imdb_id)
191
- Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
192
- end
191
+ def get_movie_page(imdb_id)
192
+ Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
193
+ end
193
194
 
194
- def self.get_media_page(url_fragment)
195
- Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
196
- end
195
+ def get_episodes_page(imdb_id)
196
+ Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
197
+ end
198
+
199
+ def get_media_page(url_fragment)
200
+ Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
201
+ end
197
202
 
198
- def self.get_title_and_year_from_meta(doc)
199
- return nil, nil unless doc.xpath("//meta[@name='title']").first
203
+ def get_title_and_year_from_meta(doc)
204
+ title_text = doc.at_css("meta[name='title']").try(:[], 'content')
205
+ # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
206
+ if title_text && title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
207
+ movie_title = self.clean_title($1)
208
+ movie_year = $2.to_i
209
+ end
210
+ return movie_title, movie_year
211
+ end
212
+
213
+ # Remove surrounding double quotes that seems to appear on tv show name
214
+ def clean_title(movie_title)
215
+ movie_title = $1 if movie_title =~ /^"(.*)"$/
216
+ return movie_title.strip
217
+ end
200
218
 
201
- title_text = doc.xpath("//meta[@name='title']").first['content']
202
- # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
203
- if title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
204
- movie_title = $1
205
- movie_year = $2.to_i
206
-
207
- movie_title = self.clean_title(movie_title)
219
+ # Hackyness to get around ruby 1.9 encoding issue
220
+ def strip_whitespace(s)
221
+ s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
222
+ end
223
+
224
+ def video_type(td)
225
+ return :tv_show if td.content =~ /\((TV series|TV)\)/
226
+ return :movie
227
+ end
228
+
229
+ def video_type_from_meta(doc)
230
+ type_text = doc.at_css("meta[property='og:type']").try(:[], 'content')
231
+ type_text == 'tv_show' ? :tv_show : :movie
208
232
  end
209
- return movie_title, movie_year
210
- end
211
233
 
212
- # Remove surrounding double quotes that seems to appear on tv show name
213
- def self.clean_title(movie_title)
214
- movie_title = $1 if movie_title =~ /^"(.*)"$/
215
- return movie_title.strip
216
- end
217
-
218
- # Hackyness to get around ruby 1.9 encoding issue
219
- def self.strip_whitespace(s)
220
- s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
221
- end
222
-
223
- def self.video_type(td)
224
- return :tv_show if td.content =~ /\((TV series|TV)\)/
225
- return :movie
226
- end
227
-
228
- def self.video_type_from_meta(doc)
229
- meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
230
- return :movie unless meta_type_tag.first
231
- type_text = meta_type_tag.first['content']
232
- case type_text
233
- when 'tv_show' then return :tv_show
234
- else return :movie
235
- end
236
234
  end
237
235
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 10
9
- version: 0.1.10
7
+ - 2
8
+ - 0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Sam Cavenagh
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-12 00:00:00 +11:00
17
+ date: 2011-03-06 00:00:00 +11:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency