yayimdbs 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +31 -0
- data/lib/yay_imdbs.rb +201 -0
- metadata +130 -0
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Yay IMDBs !
|
2
|
+
|
3
|
+
Overview
|
4
|
+
--------
|
5
|
+
Yet Another Ying IMDB Scraper
|
6
|
+
|
7
|
+
This is a simple IMDb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
|
8
|
+
|
9
|
+
Features
|
10
|
+
--------
|
11
|
+
* Basic search functionality, which can be limited based on: title, year and type (tv show or movie)
|
12
|
+
* Built on Nokogiri rather than Hpricot (I was having encoding issues with Hpricot on Ruby 1.9.2)
|
13
|
+
* Support for scraping most info from a movie page
|
14
|
+
* Support for getting a thumbnail and a large image url
|
15
|
+
* Support for scraping tv show episodes
|
16
|
+
|
17
|
+
Installation
|
18
|
+
------------
|
19
|
+
TODO
|
20
|
+
|
21
|
+
Examples
|
22
|
+
--------
|
23
|
+
TODO
|
24
|
+
|
25
|
+
Licence
|
26
|
+
-------
|
27
|
+
MIT
|
28
|
+
|
29
|
+
Contact
|
30
|
+
-------
|
31
|
+
Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
|
data/lib/yay_imdbs.rb
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'active_support/all'
|
5
|
+
|
6
|
+
# Scraper for IMDb search results, title pages and TV episode lists.
#
# Everything is exposed as class methods; the class holds no state. Pages are
# fetched with open-uri and parsed with Nokogiri.
class YayImdbs
  IMDB_BASE_URL = 'http://www.imdb.com/'
  IMDB_SEARCH_URL = IMDB_BASE_URL + 'find?s=tt&q='
  IMDB_MOVIE_URL = IMDB_BASE_URL + 'title/tt'

  # Noise removed from scraped info values: runs of two or more whitespace
  # characters, newlines, pipe separators, and the byte sequence C2 A0 C2 BB
  # (UTF-8 for a non-breaking space followed by "»").
  STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/

  # Finds the imdb id of the first search result matching the given criteria.
  #
  # name      - title to search for
  # year      - expected release year (Integer); when nil/false the first
  #             result of the right type is returned regardless of year
  # tv_series - true to accept only tv shows, false to accept only movies
  #
  # Returns the imdb id (String of digits, without the "tt" prefix) or nil
  # when nothing matches.
  def self.search_for_imdb_id(name, year, tv_series=false)
    match = search_imdb(name).find do |result|
      # Ensure result is the correct video type
      next false if (result[:video_type] == :tv_show) != tv_series

      # If no year was provided the first result of the right type wins
      !year || result[:year] == year
    end
    match && match[:imdb_id]
  end

  # Searches imdb for +search_term+.
  #
  # Returns an Array of Hashes with the keys :imdb_id, :name, :year and
  # :video_type (:movie or :tv_show). :year is nil when no "(YYYY)" marker is
  # found in the result cell.
  #
  # Raises RuntimeError when the response looks like a title page (exact
  # match) but no imdb id can be extracted from its canonical link.
  def self.search_imdb(search_term)
    doc = get_search_page(search_term)

    # If the search is an exact match imdb will redirect to the movie page
    # rather than the search results page. We use the title meta element to
    # determine whether we got an exact match.
    movie_title, movie_year = get_title_and_year_from_meta(doc)
    if movie_title
      # Check the node itself: a NodeSet is truthy even when it is empty.
      canonical_link = doc.xpath("//link[@rel='canonical']").first
      if canonical_link && canonical_link['href'] =~ /tt(\d+)\//
        return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
      else
        raise "Unable to extract imdb id from exact search result"
      end
    end

    search_results = []
    doc.xpath("//td").each do |td|
      td.xpath(".//a").each do |link|
        current_name = link.content

        # Ignore links with no text (e.g. image links)
        next unless current_name.present?
        # Only links pointing at a title page are search results
        next unless link['href'] =~ /^\/title\/tt(\d+)/
        imdb_id = $1

        # The capture may look like "2010/I"; to_i keeps the leading year.
        current_year = $1.to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
        search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
      end
    end

    search_results
  end

  # Scrapes the title page for +imdb_id+ (String of digits, no "tt" prefix).
  #
  # Returns a hash with indifferent access containing 'title', 'year',
  # 'video_type', one entry per info block on the page (key is the block
  # heading, lower-cased, with whitespace replaced by underscores),
  # 'small_image'/'large_image' when a poster is present, and 'episodes'
  # when the page has a "seasons" info block.
  #
  # Raises RuntimeError when the title cannot be determined or when the page
  # contains no info blocks.
  def self.scrap_movie_info(imdb_id)
    info_hash = {}.with_indifferent_access

    doc = get_movie_page(imdb_id)
    info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
    # If we can't get title and year something is wrong
    raise "Unable to find title or year for imdb id #{imdb_id}" if info_hash['title'].nil?
    info_hash['video_type'] = video_type_from_meta(doc)

    found_info_divs = false
    doc.xpath("//div[@class='info']").each do |div|
      heading = div.xpath(".//h5").first
      next unless heading
      found_info_divs = true

      # An info block without a content div carries no value; skip it rather
      # than failing on a nil node.
      content_node = div.xpath(".//div[@class = 'info-content']").first
      next unless content_node

      key = heading.inner_text.sub(':', '').downcase
      info_hash[key.gsub(/\s/, '_')] = parse_info_value(key, content_node, imdb_id)
    end

    # If we don't find any info divs assume parsing failed
    raise "No info divs found for imdb id #{imdb_id}" unless found_info_divs

    scrap_poster_urls(doc).each { |image_key, url| info_hash[image_key] = url }

    # Only tv series pages have a "seasons" info block
    info_hash['episodes'] = scrap_episodes(imdb_id) if info_hash.has_key?('seasons')

    info_hash
  end

  # NOTE: a bare `private` does not apply to `def self.` methods, so the page
  # and parsing helpers below have always been publicly callable. They are
  # kept public for backward compatibility but are internal helpers.

  # Fetches and parses the search results page for +name+.
  def self.get_search_page(name)
    # URI.escape was removed in Ruby 3.0 and left characters such as "&"
    # unescaped; encode the term as a query-string component instead.
    fetch_document(IMDB_SEARCH_URL + URI.encode_www_form_component(name))
  end

  # Fetches and parses the title page for +imdb_id+.
  def self.get_movie_page(imdb_id)
    fetch_document(IMDB_MOVIE_URL + imdb_id)
  end

  # Fetches and parses the episode list page for +imdb_id+.
  def self.get_episodes_page(imdb_id)
    fetch_document(IMDB_MOVIE_URL + imdb_id + '/episodes')
  end

  # Extracts the title and year from the page's <meta name="title"> element.
  #
  # Returns [title, year]; both are nil when the element is missing or its
  # content does not look like 'Movie Name (2010)'.
  def self.get_title_and_year_from_meta(doc)
    title_meta = doc.xpath("//meta[@name='title']").first
    return nil, nil unless title_meta

    # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)'
    return nil, nil unless title_meta['content'] =~ /(.*) \((\d{4})\/?\w*\)/
    raw_title, movie_year = $1, $2.to_i

    return clean_title(raw_title), movie_year
  end

  # Removes the surrounding double quotes that appear around tv show names
  # and strips leading/trailing whitespace.
  def self.clean_title(movie_title)
    movie_title = $1 if movie_title =~ /^"(.*)"$/
    movie_title.strip
  end

  # Classifies a search result cell: :tv_show when its text contains
  # "(TV series)" or "(TV)", otherwise :movie.
  def self.video_type(td)
    td.content =~ /\((TV series|TV)\)/ ? :tv_show : :movie
  end

  # Classifies a title page from the first meta element whose property
  # attribute contains "type". Anything other than 'tv_show' (including a
  # missing element) is treated as :movie.
  def self.video_type_from_meta(doc)
    meta_type_tag = doc.xpath("//meta[contains(@property,'type')]").first
    return :movie unless meta_type_tag

    meta_type_tag['content'] == 'tv_show' ? :tv_show : :movie
  end

  # Downloads +url+ and parses it as HTML, closing the connection afterwards.
  def self.fetch_document(url)
    parse = proc { |io| Nokogiri::HTML(io) }
    # Kernel#open no longer opens URLs as of Ruby 3.0; URI.open is the
    # open-uri entry point on Ruby 2.5+. Older Rubies only have Kernel#open.
    URI.respond_to?(:open) ? URI.open(url, &parse) : open(url, &parse)
  end

  # Converts the content of one info block into its final value.
  #
  # key          - lower-cased block heading, e.g. 'runtime'
  # content_node - the block's info-content node
  # imdb_id      - only used in warning messages
  #
  # Returns a Date (or nil when unparsable) for 'release date', an Integer
  # number of minutes for 'runtime' (the raw text when the format is
  # unexpected), an Array of words for 'genre', a single language for
  # 'language', and the cleaned text for everything else.
  def self.parse_info_value(key, content_node, imdb_id)
    # Try to only get text values and ignore links, as some info blocks have
    # a "click for more info" type link at the end.
    value = content_node.children.map { |e| e.text? ? e.to_s : '' }.join.gsub(STRIP_WHITESPACE, '').strip
    value = content_node.content.gsub(STRIP_WHITESPACE, '') if value.empty?

    case key
    when 'release date'
      begin
        Date.strptime(value, '%d %B %Y')
      rescue ArgumentError
        warn "Invalid date '#{value}' for imdb id: #{imdb_id}"
        nil
      end
    when 'runtime'
      if value =~ /(\d+)\smin/
        $1.to_i
      else
        warn "Unexpected runtime format #{value} for movie #{imdb_id}"
        value
      end
    when 'genre'
      value.sub(/(See more$)|(more$)/, '').strip.split
    when 'language'
      # This is a bit of a hack: multiple languages are not supported, so if
      # there is more than one use English, or else the first one found.
      languages = content_node.inner_text.split(/\|/).map { |l| l.strip }
      languages.select { |l| l.downcase == 'english' }.last || languages.first
    else
      value
    end
  end

  # Scrapes the poster image urls from a title page.
  #
  # Returns a Hash that is empty when the page has no poster, and otherwise
  # contains 'small_image' and 'large_image' (nil when the large image could
  # not be located).
  def self.scrap_poster_urls(doc)
    urls = {}
    thumb = doc.xpath("//div[@class = 'photo']/a/img").first
    return urls unless thumb

    thumbnail_url = thumb['src']
    # Skip urls ending in addposter.jpg (presumably the "add a poster"
    # placeholder shown when a title has no poster).
    return urls if thumbnail_url =~ /addposter\.jpg$/
    urls['small_image'] = thumbnail_url

    # Try to scrap a larger version of the image url
    large_img_page = doc.xpath("//div[@class = 'photo']/a").first['href']
    large_img = fetch_document('http://www.imdb.com' + large_img_page).xpath("//img[@id = 'primary-img']").first
    urls['large_image'] = large_img && large_img['src']
    urls
  end

  # Scrapes the episode list for the tv series +imdb_id+.
  #
  # Returns an Array of Hashes with the keys "series", "episode" and "title",
  # plus 'date' and 'plot' when an air date is found in the episode's cells.
  def self.scrap_episodes(imdb_id)
    episodes = []
    get_episodes_page(imdb_id).css(".filter-all").each do |e_div|
      next unless e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
      # $' is the text following the match, i.e. the episode title
      episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}

      if e_div.xpath(".//td").inner_text =~ /(\d+ (January|February|March|April|May|June|July|August|September|October|November|December) \d{4})/
        episode['date'] = Date.parse($1)
        # Whatever follows the air date is the plot summary
        episode['plot'] = $'.strip
      end
      episodes << episode
    end
    episodes
  end

  private_class_method :fetch_document, :parse_info_value, :scrap_poster_urls, :scrap_episodes
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: yayimdbs
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Sam Cavenagh
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-07-07 00:00:00 +10:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: nokogiri
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: active_support
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: tzinfo
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id003
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: i18n
|
61
|
+
prerelease: false
|
62
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
type: :runtime
|
71
|
+
version_requirements: *id004
|
72
|
+
- !ruby/object:Gem::Dependency
|
73
|
+
name: rspec
|
74
|
+
prerelease: false
|
75
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
segments:
|
81
|
+
- 0
|
82
|
+
version: "0"
|
83
|
+
type: :development
|
84
|
+
version_requirements: *id005
|
85
|
+
description:
|
86
|
+
email: cavenaghweb@hotmail.com
|
87
|
+
executables: []
|
88
|
+
|
89
|
+
extensions: []
|
90
|
+
|
91
|
+
extra_rdoc_files:
|
92
|
+
- README.md
|
93
|
+
files:
|
94
|
+
- README.md
|
95
|
+
- lib/yay_imdbs.rb
|
96
|
+
has_rdoc: true
|
97
|
+
homepage: http://github.com/o-sam-o/yayimdbs
|
98
|
+
licenses: []
|
99
|
+
|
100
|
+
post_install_message:
|
101
|
+
rdoc_options:
|
102
|
+
- --main
|
103
|
+
- README.md
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project:
|
125
|
+
rubygems_version: 1.3.7
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: Yet Another Ying IMDB Scraper
|
129
|
+
test_files: []
|
130
|
+
|