yayimdbs 0.1.10 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/lib/yay_imdbs.rb +180 -182
- metadata +4 -4
data/README.md
CHANGED
@@ -4,7 +4,7 @@ Overview
|
|
4
4
|
--------
|
5
5
|
Yet Another Ying IMDB Scraper
|
6
6
|
|
7
|
-
This is a simple imdb scraper, that
|
7
|
+
This is a simple imdb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
|
8
8
|
|
9
9
|
Features
|
10
10
|
--------
|
@@ -49,4 +49,4 @@ MIT
|
|
49
49
|
|
50
50
|
Contact
|
51
51
|
-------
|
52
|
-
Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
|
52
|
+
Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
|
data/lib/yay_imdbs.rb
CHANGED
@@ -18,220 +18,218 @@ class YayImdbs
|
|
18
18
|
|
19
19
|
STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
if movie_title
|
43
|
-
canonical_link = doc.xpath("//link[@rel='canonical']")
|
44
|
-
if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
|
45
|
-
return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
|
46
|
-
else
|
47
|
-
raise "Unable to extract imdb id from exact search result"
|
21
|
+
DATE_PROPERTIES = [:release_date]
|
22
|
+
LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
|
23
|
+
INT_LIST_PROPERTIES = [:year, :season]
|
24
|
+
PROPERTY_ALIAS = {:genres => :genre,
|
25
|
+
:taglines => :tagline,
|
26
|
+
:year => :years,
|
27
|
+
:season => :seasons,
|
28
|
+
:language => :languages,
|
29
|
+
:motion_picture_rating_mpaa => :mpaa}
|
30
|
+
|
31
|
+
class << self
|
32
|
+
|
33
|
+
def search_for_imdb_id(name, year=nil, type=nil)
|
34
|
+
search_results = self.search_imdb(name)
|
35
|
+
|
36
|
+
search_results.each do |result|
|
37
|
+
# Ensure result is the correct video type
|
38
|
+
next if type && (result[:video_type] != type)
|
39
|
+
|
40
|
+
# If no year provided just return first result
|
41
|
+
return result[:imdb_id] if year.nil? || result[:year] == year
|
48
42
|
end
|
43
|
+
return nil
|
49
44
|
end
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
45
|
+
|
46
|
+
def search_imdb(search_term)
|
47
|
+
search_results = []
|
48
|
+
|
49
|
+
doc = self.get_search_page(search_term)
|
50
|
+
|
51
|
+
# If the search is an exact match imdb will redirect to the movie page not search results page
|
52
|
+
# we use the title meta element to determine if we got an exact match
|
53
|
+
movie_title, movie_year = get_title_and_year_from_meta(doc)
|
54
|
+
if movie_title
|
55
|
+
canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
|
56
|
+
if canonical_link && canonical_link =~ /tt(\d+)\//
|
57
|
+
return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
|
58
|
+
else
|
59
|
+
raise "Unable to extract imdb id from exact search result"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
doc.css("td").each do |td|
|
64
|
+
td.css("a").each do |link|
|
65
|
+
href = link['href']
|
66
|
+
current_name = link.content
|
67
|
+
|
68
|
+
# Ignore links with no text (e.g. image links) or links that don't link to movie pages
|
69
|
+
next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
|
61
70
|
imdb_id = $1
|
62
71
|
current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
|
63
|
-
search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type =>
|
72
|
+
search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
|
64
73
|
end
|
65
74
|
end
|
66
|
-
end
|
67
|
-
|
68
|
-
return search_results
|
69
|
-
end
|
70
|
-
|
71
|
-
def self.scrap_movie_info(imdb_id)
|
72
|
-
info_hash = {:imdb_id => imdb_id}.with_indifferent_access
|
73
|
-
|
74
|
-
doc = self.get_movie_page(imdb_id)
|
75
|
-
info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
|
76
|
-
if info_hash['title'].nil?
|
77
|
-
#If we cant get title and year something is wrong
|
78
|
-
raise "Unable to find title or year for imdb id #{imdb_id}"
|
79
|
-
end
|
80
|
-
info_hash['video_type'] = self.video_type_from_meta(doc)
|
81
75
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
76
|
+
return search_results
|
77
|
+
end
|
78
|
+
|
79
|
+
def scrap_movie_info(imdb_id)
|
80
|
+
info_hash = {:imdb_id => imdb_id}.with_indifferent_access
|
81
|
+
|
82
|
+
doc = self.get_movie_page(imdb_id)
|
83
|
+
title, year = get_title_and_year_from_meta(doc)
|
84
|
+
info_hash[:title], info_hash[:year] = title, year
|
85
|
+
if info_hash['title'].nil?
|
86
|
+
#If we cant get title and year something is wrong
|
87
|
+
raise "Unable to find title or year for imdb id #{imdb_id}"
|
88
|
+
end
|
89
|
+
info_hash[:video_type] = self.video_type_from_meta(doc)
|
92
90
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
91
|
+
info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
|
92
|
+
info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil
|
93
|
+
|
94
|
+
found_info_divs = false
|
95
|
+
movie_properties(doc) do |key, value|
|
96
|
+
found_info_divs = true
|
97
|
+
info_hash["raw_#{key}"] = value
|
98
|
+
info_hash[key] = clean_movie_property(key, value)
|
99
|
+
info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
|
100
|
+
end
|
101
|
+
|
102
|
+
if not found_info_divs
|
103
|
+
#If we don't find any info divs assume parsing failed
|
104
|
+
raise "No info divs found for imdb id #{imdb_id}"
|
105
|
+
end
|
106
|
+
|
107
|
+
# Hack: tv shows can have a year property, which is a list, fixing ...
|
108
|
+
info_hash[:year] = year
|
109
|
+
|
110
|
+
self.scrap_images(doc, info_hash)
|
111
|
+
|
112
|
+
#scrap episodes if tv series
|
113
|
+
if info_hash.has_key?('season')
|
114
|
+
self.scrap_episodes(info_hash)
|
115
|
+
end
|
116
|
+
|
117
|
+
return info_hash
|
118
|
+
end
|
119
|
+
|
120
|
+
def clean_movie_property(key, value)
|
121
|
+
if DATE_PROPERTIES.include?(key)
|
122
|
+
value = Date.strptime(value, '%d %B %Y') rescue nil
|
123
|
+
elsif key == :runtime
|
101
124
|
if value =~ /(\d+)\smin/
|
102
125
|
value = $1.to_i
|
103
126
|
else
|
104
|
-
|
127
|
+
value = nil
|
105
128
|
end
|
106
|
-
elsif key
|
129
|
+
elsif LIST_PROPERTIES.include?(key)
|
107
130
|
value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
|
108
|
-
|
109
|
-
info_hash[:genre] = value
|
110
|
-
elsif key == 'year'
|
131
|
+
elsif INT_LIST_PROPERTIES.include?(key)
|
111
132
|
value = value.split('|').collect { |l| l.strip.to_i }.reject { |y| y <= 0 }
|
112
|
-
# TV shows can have multiple years
|
113
|
-
info_hash[:years] = value
|
114
|
-
value = value.sort.first
|
115
|
-
elsif key == 'language'
|
116
|
-
value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9]/, '') }
|
117
|
-
elsif key == 'taglines'
|
118
|
-
# Backwards compatibility
|
119
|
-
info_hash['tagline'] = value
|
120
|
-
elsif key == 'motion picture rating (mpaa)'
|
121
|
-
value = value.gsub(/See all certifications/, '').strip
|
122
|
-
# Backwards compatibility FIXME do with a map
|
123
|
-
info_hash['mpaa'] = value
|
124
133
|
end
|
125
|
-
|
126
|
-
end
|
127
|
-
|
128
|
-
if not found_info_divs
|
129
|
-
#If we don't find any info divs assume parsing failed
|
130
|
-
raise "No info divs found for imdb id #{imdb_id}"
|
131
|
-
end
|
132
|
-
|
133
|
-
self.scrap_images(doc, info_hash)
|
134
|
-
|
135
|
-
#scrap episodes if tv series
|
136
|
-
if info_hash.has_key?('season')
|
137
|
-
self.scrap_episodes(info_hash)
|
134
|
+
return value
|
138
135
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
# Small thumbnail image, gotten by hacking medium url
|
153
|
-
info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
|
154
|
-
|
155
|
-
#Try to scrap a larger version of the image url
|
156
|
-
large_img_page = doc.xpath("//td[@id = 'img_primary']/a").first['href']
|
157
|
-
large_img_doc = self.get_media_page(large_img_page)
|
158
|
-
large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
|
159
|
-
info_hash['large_image'] = large_img_url
|
136
|
+
|
137
|
+
def movie_properties(doc)
|
138
|
+
doc.css("div h4").each do |h4|
|
139
|
+
div = h4.parent
|
140
|
+
raw_key = h4.inner_text
|
141
|
+
key = raw_key.sub(':', '').strip.downcase
|
142
|
+
value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
|
143
|
+
value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)$/, '').strip
|
144
|
+
|
145
|
+
symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
|
146
|
+
|
147
|
+
yield symbol_key, value
|
160
148
|
end
|
161
149
|
end
|
162
|
-
end
|
163
150
|
|
164
|
-
|
151
|
+
def scrap_images(doc, info_hash)
|
152
|
+
#scrap poster image urls
|
153
|
+
thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
|
154
|
+
return if thumbnail_url.nil? || thumbnail_url =~ /\/nopicture\//
|
155
|
+
|
156
|
+
info_hash['medium_image'] = thumbnail_url
|
157
|
+
# Small thumbnail image, gotten by hacking medium url
|
158
|
+
info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
|
159
|
+
|
160
|
+
#Try to scrap a larger version of the image url
|
161
|
+
large_img_page_link = doc.at_css("td[id=img_primary] a").try(:[], 'href')
|
162
|
+
return unless large_img_page_link
|
163
|
+
large_img_doc = get_media_page(large_img_page_link)
|
164
|
+
large_img_url = large_img_doc.at_css("img[id=primary-img]").try(:[], 'src')
|
165
|
+
info_hash['large_image'] = large_img_url
|
166
|
+
end
|
167
|
+
|
168
|
+
def scrap_episodes(info_hash)
|
165
169
|
episodes = []
|
166
170
|
doc = self.get_episodes_page(info_hash[:imdb_id])
|
167
|
-
|
168
|
-
|
169
|
-
|
171
|
+
|
172
|
+
doc.css(".filter-all").each do |e_div|
|
173
|
+
next unless e_div.at_css('h3').inner_text =~ /Season (\d+), Episode (\d+):/
|
170
174
|
episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
episodes << episode
|
175
|
+
|
176
|
+
raw_date = e_div.at_css('strong').inner_text.strip
|
177
|
+
episode['date'] = Date.parse(raw_date) rescue nil
|
178
|
+
if e_div.inner_text =~ /#{raw_date}/
|
179
|
+
episode['plot'] = $'.strip
|
177
180
|
end
|
181
|
+
|
182
|
+
episodes << episode
|
178
183
|
end
|
179
184
|
info_hash['episodes'] = episodes
|
180
|
-
end
|
181
|
-
|
182
|
-
def self.get_search_page(name)
|
183
|
-
Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
|
184
185
|
end
|
185
186
|
|
186
|
-
|
187
|
-
|
188
|
-
|
187
|
+
def get_search_page(name)
|
188
|
+
Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
|
189
|
+
end
|
189
190
|
|
190
|
-
|
191
|
-
|
192
|
-
|
191
|
+
def get_movie_page(imdb_id)
|
192
|
+
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
|
193
|
+
end
|
193
194
|
|
194
|
-
|
195
|
-
|
196
|
-
|
195
|
+
def get_episodes_page(imdb_id)
|
196
|
+
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
|
197
|
+
end
|
198
|
+
|
199
|
+
def get_media_page(url_fragment)
|
200
|
+
Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
|
201
|
+
end
|
197
202
|
|
198
|
-
|
199
|
-
|
203
|
+
def get_title_and_year_from_meta(doc)
|
204
|
+
title_text = doc.at_css("meta[name='title']").try(:[], 'content')
|
205
|
+
# Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
|
206
|
+
if title_text && title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
|
207
|
+
movie_title = self.clean_title($1)
|
208
|
+
movie_year = $2.to_i
|
209
|
+
end
|
210
|
+
return movie_title, movie_year
|
211
|
+
end
|
212
|
+
|
213
|
+
# Remove surrounding double quotes that seem to appear on tv show names
|
214
|
+
def clean_title(movie_title)
|
215
|
+
movie_title = $1 if movie_title =~ /^"(.*)"$/
|
216
|
+
return movie_title.strip
|
217
|
+
end
|
200
218
|
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
219
|
+
# Hackyness to get around ruby 1.9 encoding issue
|
220
|
+
def strip_whitespace(s)
|
221
|
+
s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
|
222
|
+
end
|
223
|
+
|
224
|
+
def video_type(td)
|
225
|
+
return :tv_show if td.content =~ /\((TV series|TV)\)/
|
226
|
+
return :movie
|
227
|
+
end
|
228
|
+
|
229
|
+
def video_type_from_meta(doc)
|
230
|
+
type_text = doc.at_css("meta[property='og:type']").try(:[], 'content')
|
231
|
+
type_text == 'tv_show' ? :tv_show : :movie
|
208
232
|
end
|
209
|
-
return movie_title, movie_year
|
210
|
-
end
|
211
233
|
|
212
|
-
# Remove surrounding double quotes that seems to appear on tv show name
|
213
|
-
def self.clean_title(movie_title)
|
214
|
-
movie_title = $1 if movie_title =~ /^"(.*)"$/
|
215
|
-
return movie_title.strip
|
216
|
-
end
|
217
|
-
|
218
|
-
# Hackyness to get around ruby 1.9 encoding issue
|
219
|
-
def self.strip_whitespace(s)
|
220
|
-
s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
|
221
|
-
end
|
222
|
-
|
223
|
-
def self.video_type(td)
|
224
|
-
return :tv_show if td.content =~ /\((TV series|TV)\)/
|
225
|
-
return :movie
|
226
|
-
end
|
227
|
-
|
228
|
-
def self.video_type_from_meta(doc)
|
229
|
-
meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
|
230
|
-
return :movie unless meta_type_tag.first
|
231
|
-
type_text = meta_type_tag.first['content']
|
232
|
-
case type_text
|
233
|
-
when 'tv_show' then return :tv_show
|
234
|
-
else return :movie
|
235
|
-
end
|
236
234
|
end
|
237
235
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Sam Cavenagh
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-03-06 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|