yayimdbs 0.1.10 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/lib/yay_imdbs.rb +180 -182
- metadata +4 -4
data/README.md
CHANGED
@@ -4,7 +4,7 @@ Overview
|
|
4
4
|
--------
|
5
5
|
Yet Another Ying IMDB Scraper
|
6
6
|
|
7
|
-
This is a simple imdb scraper, that
|
7
|
+
This is a simple imdb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
|
8
8
|
|
9
9
|
Features
|
10
10
|
--------
|
@@ -49,4 +49,4 @@ MIT
|
|
49
49
|
|
50
50
|
Contact
|
51
51
|
-------
|
52
|
-
Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
|
52
|
+
Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
|
data/lib/yay_imdbs.rb
CHANGED
@@ -18,220 +18,218 @@ class YayImdbs
|
|
18
18
|
|
19
19
|
STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
if movie_title
|
43
|
-
canonical_link = doc.xpath("//link[@rel='canonical']")
|
44
|
-
if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
|
45
|
-
return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
|
46
|
-
else
|
47
|
-
raise "Unable to extract imdb id from exact search result"
|
21
|
+
DATE_PROPERTIES = [:release_date]
|
22
|
+
LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
|
23
|
+
INT_LIST_PROPERTIES = [:year, :season]
|
24
|
+
PROPERTY_ALIAS = {:genres => :genre,
|
25
|
+
:taglines => :tagline,
|
26
|
+
:year => :years,
|
27
|
+
:season => :seasons,
|
28
|
+
:language => :languages,
|
29
|
+
:motion_picture_rating_mpaa => :mpaa}
|
30
|
+
|
31
|
+
class << self
|
32
|
+
|
33
|
+
def search_for_imdb_id(name, year=nil, type=nil)
|
34
|
+
search_results = self.search_imdb(name)
|
35
|
+
|
36
|
+
search_results.each do |result|
|
37
|
+
# Ensure result is the correct video type
|
38
|
+
next if type && (result[:video_type] != type)
|
39
|
+
|
40
|
+
# If no year provided just return first result
|
41
|
+
return result[:imdb_id] if year.nil? || result[:year] == year
|
48
42
|
end
|
43
|
+
return nil
|
49
44
|
end
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
45
|
+
|
46
|
+
def search_imdb(search_term)
|
47
|
+
search_results = []
|
48
|
+
|
49
|
+
doc = self.get_search_page(search_term)
|
50
|
+
|
51
|
+
# If the search is an exact match imdb will redirect to the movie page not search results page
|
52
|
+
# we use the title meta element to determine if we got an exact match
|
53
|
+
movie_title, movie_year = get_title_and_year_from_meta(doc)
|
54
|
+
if movie_title
|
55
|
+
canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
|
56
|
+
if canonical_link && canonical_link =~ /tt(\d+)\//
|
57
|
+
return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
|
58
|
+
else
|
59
|
+
raise "Unable to extract imdb id from exact search result"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
doc.css("td").each do |td|
|
64
|
+
td.css("a").each do |link|
|
65
|
+
href = link['href']
|
66
|
+
current_name = link.content
|
67
|
+
|
68
|
+
# Ignore links with no text (e.g. image links) or links that don't link to movie pages
|
69
|
+
next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
|
61
70
|
imdb_id = $1
|
62
71
|
current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
|
63
|
-
search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type =>
|
72
|
+
search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
|
64
73
|
end
|
65
74
|
end
|
66
|
-
end
|
67
|
-
|
68
|
-
return search_results
|
69
|
-
end
|
70
|
-
|
71
|
-
def self.scrap_movie_info(imdb_id)
|
72
|
-
info_hash = {:imdb_id => imdb_id}.with_indifferent_access
|
73
|
-
|
74
|
-
doc = self.get_movie_page(imdb_id)
|
75
|
-
info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
|
76
|
-
if info_hash['title'].nil?
|
77
|
-
#If we cant get title and year something is wrong
|
78
|
-
raise "Unable to find title or year for imdb id #{imdb_id}"
|
79
|
-
end
|
80
|
-
info_hash['video_type'] = self.video_type_from_meta(doc)
|
81
75
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
76
|
+
return search_results
|
77
|
+
end
|
78
|
+
|
79
|
+
def scrap_movie_info(imdb_id)
|
80
|
+
info_hash = {:imdb_id => imdb_id}.with_indifferent_access
|
81
|
+
|
82
|
+
doc = self.get_movie_page(imdb_id)
|
83
|
+
title, year = get_title_and_year_from_meta(doc)
|
84
|
+
info_hash[:title], info_hash[:year] = title, year
|
85
|
+
if info_hash['title'].nil?
|
86
|
+
#If we cant get title and year something is wrong
|
87
|
+
raise "Unable to find title or year for imdb id #{imdb_id}"
|
88
|
+
end
|
89
|
+
info_hash[:video_type] = self.video_type_from_meta(doc)
|
92
90
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
91
|
+
info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
|
92
|
+
info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil
|
93
|
+
|
94
|
+
found_info_divs = false
|
95
|
+
movie_properties(doc) do |key, value|
|
96
|
+
found_info_divs = true
|
97
|
+
info_hash["raw_#{key}"] = value
|
98
|
+
info_hash[key] = clean_movie_property(key, value)
|
99
|
+
info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
|
100
|
+
end
|
101
|
+
|
102
|
+
if not found_info_divs
|
103
|
+
#If we don't find any info divs assume parsing failed
|
104
|
+
raise "No info divs found for imdb id #{imdb_id}"
|
105
|
+
end
|
106
|
+
|
107
|
+
# Hack: tv shows can have a year property, which is a list, fixing ...
|
108
|
+
info_hash[:year] = year
|
109
|
+
|
110
|
+
self.scrap_images(doc, info_hash)
|
111
|
+
|
112
|
+
#scrap episodes if tv series
|
113
|
+
if info_hash.has_key?('season')
|
114
|
+
self.scrap_episodes(info_hash)
|
115
|
+
end
|
116
|
+
|
117
|
+
return info_hash
|
118
|
+
end
|
119
|
+
|
120
|
+
def clean_movie_property(key, value)
|
121
|
+
if DATE_PROPERTIES.include?(key)
|
122
|
+
value = Date.strptime(value, '%d %B %Y') rescue nil
|
123
|
+
elsif key == :runtime
|
101
124
|
if value =~ /(\d+)\smin/
|
102
125
|
value = $1.to_i
|
103
126
|
else
|
104
|
-
|
127
|
+
value = nil
|
105
128
|
end
|
106
|
-
elsif key
|
129
|
+
elsif LIST_PROPERTIES.include?(key)
|
107
130
|
value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
|
108
|
-
|
109
|
-
info_hash[:genre] = value
|
110
|
-
elsif key == 'year'
|
131
|
+
elsif INT_LIST_PROPERTIES.include?(key)
|
111
132
|
value = value.split('|').collect { |l| l.strip.to_i }.reject { |y| y <= 0 }
|
112
|
-
# TV shows can have multiple years
|
113
|
-
info_hash[:years] = value
|
114
|
-
value = value.sort.first
|
115
|
-
elsif key == 'language'
|
116
|
-
value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9]/, '') }
|
117
|
-
elsif key == 'taglines'
|
118
|
-
# Backwards compatibility
|
119
|
-
info_hash['tagline'] = value
|
120
|
-
elsif key == 'motion picture rating (mpaa)'
|
121
|
-
value = value.gsub(/See all certifications/, '').strip
|
122
|
-
# Backwards compatibility FIXME do with a map
|
123
|
-
info_hash['mpaa'] = value
|
124
133
|
end
|
125
|
-
|
126
|
-
end
|
127
|
-
|
128
|
-
if not found_info_divs
|
129
|
-
#If we don't find any info divs assume parsing failed
|
130
|
-
raise "No info divs found for imdb id #{imdb_id}"
|
131
|
-
end
|
132
|
-
|
133
|
-
self.scrap_images(doc, info_hash)
|
134
|
-
|
135
|
-
#scrap episodes if tv series
|
136
|
-
if info_hash.has_key?('season')
|
137
|
-
self.scrap_episodes(info_hash)
|
134
|
+
return value
|
138
135
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
# Small thumbnail image, gotten by hacking medium url
|
153
|
-
info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
|
154
|
-
|
155
|
-
#Try to scrap a larger version of the image url
|
156
|
-
large_img_page = doc.xpath("//td[@id = 'img_primary']/a").first['href']
|
157
|
-
large_img_doc = self.get_media_page(large_img_page)
|
158
|
-
large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
|
159
|
-
info_hash['large_image'] = large_img_url
|
136
|
+
|
137
|
+
def movie_properties(doc)
|
138
|
+
doc.css("div h4").each do |h4|
|
139
|
+
div = h4.parent
|
140
|
+
raw_key = h4.inner_text
|
141
|
+
key = raw_key.sub(':', '').strip.downcase
|
142
|
+
value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
|
143
|
+
value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)$/, '').strip
|
144
|
+
|
145
|
+
symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
|
146
|
+
|
147
|
+
yield symbol_key, value
|
160
148
|
end
|
161
149
|
end
|
162
|
-
end
|
163
150
|
|
164
|
-
|
151
|
+
def scrap_images(doc, info_hash)
|
152
|
+
#scrap poster image urls
|
153
|
+
thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
|
154
|
+
return if thumbnail_url.nil? || thumbnail_url =~ /\/nopicture\//
|
155
|
+
|
156
|
+
info_hash['medium_image'] = thumbnail_url
|
157
|
+
# Small thumbnail image, gotten by hacking medium url
|
158
|
+
info_hash['small_image'] = thumbnail_url.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')
|
159
|
+
|
160
|
+
#Try to scrap a larger version of the image url
|
161
|
+
large_img_page_link = doc.at_css("td[id=img_primary] a").try(:[], 'href')
|
162
|
+
return unless large_img_page_link
|
163
|
+
large_img_doc = get_media_page(large_img_page_link)
|
164
|
+
large_img_url = large_img_doc.at_css("img[id=primary-img]").try(:[], 'src')
|
165
|
+
info_hash['large_image'] = large_img_url
|
166
|
+
end
|
167
|
+
|
168
|
+
def scrap_episodes(info_hash)
|
165
169
|
episodes = []
|
166
170
|
doc = self.get_episodes_page(info_hash[:imdb_id])
|
167
|
-
|
168
|
-
|
169
|
-
|
171
|
+
|
172
|
+
doc.css(".filter-all").each do |e_div|
|
173
|
+
next unless e_div.at_css('h3').inner_text =~ /Season (\d+), Episode (\d+):/
|
170
174
|
episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
episodes << episode
|
175
|
+
|
176
|
+
raw_date = e_div.at_css('strong').inner_text.strip
|
177
|
+
episode['date'] = Date.parse(raw_date) rescue nil
|
178
|
+
if e_div.inner_text =~ /#{raw_date}/
|
179
|
+
episode['plot'] = $'.strip
|
177
180
|
end
|
181
|
+
|
182
|
+
episodes << episode
|
178
183
|
end
|
179
184
|
info_hash['episodes'] = episodes
|
180
|
-
end
|
181
|
-
|
182
|
-
def self.get_search_page(name)
|
183
|
-
Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
|
184
185
|
end
|
185
186
|
|
186
|
-
|
187
|
-
|
188
|
-
|
187
|
+
def get_search_page(name)
|
188
|
+
Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
|
189
|
+
end
|
189
190
|
|
190
|
-
|
191
|
-
|
192
|
-
|
191
|
+
def get_movie_page(imdb_id)
|
192
|
+
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
|
193
|
+
end
|
193
194
|
|
194
|
-
|
195
|
-
|
196
|
-
|
195
|
+
def get_episodes_page(imdb_id)
|
196
|
+
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
|
197
|
+
end
|
198
|
+
|
199
|
+
def get_media_page(url_fragment)
|
200
|
+
Nokogiri::HTML(open(IMDB_BASE_URL + url_fragment))
|
201
|
+
end
|
197
202
|
|
198
|
-
|
199
|
-
|
203
|
+
def get_title_and_year_from_meta(doc)
|
204
|
+
title_text = doc.at_css("meta[name='title']").try(:[], 'content')
|
205
|
+
# Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
|
206
|
+
if title_text && title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/
|
207
|
+
movie_title = self.clean_title($1)
|
208
|
+
movie_year = $2.to_i
|
209
|
+
end
|
210
|
+
return movie_title, movie_year
|
211
|
+
end
|
212
|
+
|
213
|
+
# Remove surrounding double quotes that seems to appear on tv show name
|
214
|
+
def clean_title(movie_title)
|
215
|
+
movie_title = $1 if movie_title =~ /^"(.*)"$/
|
216
|
+
return movie_title.strip
|
217
|
+
end
|
200
218
|
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
219
|
+
# Hackyness to get around ruby 1.9 encoding issue
|
220
|
+
def strip_whitespace(s)
|
221
|
+
s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
|
222
|
+
end
|
223
|
+
|
224
|
+
def video_type(td)
|
225
|
+
return :tv_show if td.content =~ /\((TV series|TV)\)/
|
226
|
+
return :movie
|
227
|
+
end
|
228
|
+
|
229
|
+
def video_type_from_meta(doc)
|
230
|
+
type_text = doc.at_css("meta[property='og:type']").try(:[], 'content')
|
231
|
+
type_text == 'tv_show' ? :tv_show : :movie
|
208
232
|
end
|
209
|
-
return movie_title, movie_year
|
210
|
-
end
|
211
233
|
|
212
|
-
# Remove surrounding double quotes that seems to appear on tv show name
|
213
|
-
def self.clean_title(movie_title)
|
214
|
-
movie_title = $1 if movie_title =~ /^"(.*)"$/
|
215
|
-
return movie_title.strip
|
216
|
-
end
|
217
|
-
|
218
|
-
# Hackyness to get around ruby 1.9 encoding issue
|
219
|
-
def self.strip_whitespace(s)
|
220
|
-
s.encode('UTF-8').gsub(STRIP_WHITESPACE, '').strip
|
221
|
-
end
|
222
|
-
|
223
|
-
def self.video_type(td)
|
224
|
-
return :tv_show if td.content =~ /\((TV series|TV)\)/
|
225
|
-
return :movie
|
226
|
-
end
|
227
|
-
|
228
|
-
def self.video_type_from_meta(doc)
|
229
|
-
meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
|
230
|
-
return :movie unless meta_type_tag.first
|
231
|
-
type_text = meta_type_tag.first['content']
|
232
|
-
case type_text
|
233
|
-
when 'tv_show' then return :tv_show
|
234
|
-
else return :movie
|
235
|
-
end
|
236
234
|
end
|
237
235
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Sam Cavenagh
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-03-06 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|