yayimdbs 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +31 -0
- data/lib/yay_imdbs.rb +201 -0
- metadata +130 -0
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Yay IMDBs !
|
2
|
+
|
3
|
+
Overview
|
4
|
+
--------
|
5
|
+
Yet Another Ying IMDB Scraper
|
6
|
+
|
7
|
+
This is a simple IMDB scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
|
8
|
+
|
9
|
+
Features
|
10
|
+
--------
|
11
|
+
* Basic search functionality, which can be limited based on: title, year and type (tv show or movie)
|
12
|
+
* Built on Nokogiri rather than Hpricot (I was having encoding issues in Hpricot on Ruby 1.9.2)
|
13
|
+
* Support for scraping most info from a movie page
|
14
|
+
* Support for getting a thumbnail and a large image url
|
15
|
+
* Support for scraping tv show episodes
|
16
|
+
|
17
|
+
Installation
|
18
|
+
------------
|
19
|
+
TODO
|
20
|
+
|
21
|
+
Examples
|
22
|
+
--------
|
23
|
+
TODO
|
24
|
+
|
25
|
+
Licence
|
26
|
+
-------
|
27
|
+
MIT
|
28
|
+
|
29
|
+
Contact
|
30
|
+
-------
|
31
|
+
Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
|
data/lib/yay_imdbs.rb
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'active_support/all'
|
5
|
+
|
6
|
+
class YayImdbs
|
7
|
+
# Root of the IMDB site; the other URLs are derived from it.
IMDB_BASE_URL = 'http://www.imdb.com/'
# Title search endpoint; the search term is appended to it.
IMDB_SEARCH_URL = "#{IMDB_BASE_URL}find?s=tt&q="
# Prefix of a title page URL; the imdb id is appended to it.
IMDB_MOVIE_URL = "#{IMDB_BASE_URL}title/tt"

# Matches the noise removed from scraped values: runs of two or more
# whitespace characters, newlines, pipe separators and the byte sequence
# \302\240\302\273 (the UTF-8 encoding of a non-breaking space followed by ">>").
STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/
|
12
|
+
|
13
|
+
# Looks up the imdb id of the first search result that fits the request.
#
# name      - title to search for (handed to search_imdb)
# year      - expected year of the result; any falsy value accepts every year
# tv_series - true to accept only tv shows, false to accept only movies
#
# Returns the :imdb_id of the first suitable result, or nil when none fits.
def self.search_for_imdb_id(name, year, tv_series=false)
  hit = search_imdb(name).find do |candidate|
    # The result must be of the requested video type
    is_tv_show = candidate[:video_type] == :tv_show
    # Without a year the first result of the right type wins
    is_tv_show == tv_series && (!year || candidate[:year] == year)
  end

  hit ? hit[:imdb_id] : nil
end
|
26
|
+
|
27
|
+
# Searches IMDB for +search_term+ and returns the titles that were found.
#
# When the search is an exact match IMDB redirects to the title page instead
# of the search results page. The title meta element is used to detect that
# case, and a single result is then built from the page's canonical link.
# Otherwise every link to a title page found inside a table cell becomes one
# result.
#
# search_term - text to search for
#
# Returns an Array of Hashes with the keys :imdb_id, :name, :year and
# :video_type. :year is nil when the cell text holds no "(YYYY)" marker.
# Raises RuntimeError when an exact match page has no canonical link carrying
# an imdb id.
def self.search_imdb(search_term)
  doc = self.get_search_page(search_term)

  movie_title, movie_year = get_title_and_year_from_meta(doc)
  if movie_title
    # xpath hands back a node set even when nothing matched, so the first
    # node itself has to be checked before its href is read.
    canonical_link = doc.xpath("//link[@rel='canonical']").first
    if canonical_link && canonical_link['href'] =~ /tt(\d+)\//
      return [{:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)}]
    else
      raise "Unable to extract imdb id from exact search result"
    end
  end

  search_results = []
  doc.xpath("//td").each do |td|
    td.xpath(".//a").each do |link|
      href = link['href']
      current_name = link.content

      # Ignore links with no text (e.g. image links)
      next unless current_name.present?
      current_name = self.clean_title(current_name)

      if href =~ /^\/title\/tt(\d+)/
        # Keep the id before the next match overwrites $1
        imdb_id = $1
        # "(2010)" and "(2010/I)" both give 2010 because to_i stops at the slash
        current_year = td.inner_text =~ /\((\d{4}\/?\w*)\)/ ? $1.to_i : nil
        search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type => self.video_type(td)}
      end
    end
  end

  search_results
end
|
62
|
+
|
63
|
+
# Scrapes the IMDB title page for +imdb_id+ into a hash.
#
# The result always holds 'title', 'year' and 'video_type'. Every info div
# that has an h5 heading adds one entry keyed by that heading (colon removed,
# downcased, whitespace replaced by underscores). Some entries are converted:
#
# * 'release_date' becomes a Date, or nil when the text cannot be parsed
# * 'runtime' becomes the number of minutes when the text matches "N min"
# * 'genre' becomes an Array of words
# * 'language' becomes a single language (English when listed, else the first)
#
# 'small_image' and 'large_image' are set when the page has a poster that is
# not the "addposter.jpg" placeholder. 'episodes' is filled from the episodes
# page when the scraped info contains a 'seasons' entry.
#
# imdb_id - id handed to get_movie_page and get_episodes_page
#
# Returns a hash with indifferent access.
# Raises RuntimeError when the title cannot be read or no info divs are found.
def self.scrap_movie_info(imdb_id)
  info_hash = {}.with_indifferent_access

  doc = self.get_movie_page(imdb_id)
  info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
  if info_hash['title'].nil?
    # If we can't get title and year something is wrong
    raise "Unable to find title or year for imdb id #{imdb_id}"
  end
  info_hash['video_type'] = self.video_type_from_meta(doc)

  found_info_divs = false
  doc.xpath("//div[@class='info']").each do |div|
    next if div.xpath(".//h5").empty?
    found_info_divs = true
    key = div.xpath(".//h5").first.inner_text.sub(':', '').downcase
    value_search = ".//div[@class = 'info-content']"
    # Try to only get text values and ignore links, as some info blocks have a
    # "click for more info" type link at the end
    value = div.xpath(value_search).first.children.map { |e| e.text? ? e.to_s : '' }.join.gsub(STRIP_WHITESPACE, '').strip
    if value.empty?
      # Nothing but links in this block, so fall back to the full content
      value = div.xpath(value_search).first.content.gsub(STRIP_WHITESPACE, '')
    end

    if key == 'release date'
      begin
        value = Date.strptime(value, '%d %B %Y')
      rescue
        p "Invalid date '#{value}' for imdb id: #{imdb_id}"
        value = nil
      end
    elsif key == 'runtime'
      if value =~ /(\d+)\smin/
        value = $1.to_i
      else
        # The unparsed text is kept as the value
        p "Unexpected runtime format #{value} for movie #{imdb_id}"
      end
    elsif key == 'genre'
      value = value.sub(/(See more$)|(more$)/, '').strip.split
    elsif key == 'language'
      # This is a bit of a hack, I don't really want to deal with multiple languages, so if there is more than one
      # just use English or the first one found
      value = nil
      div.xpath(value_search).first.inner_text.split(/\|/).collect { |l| l.strip }.each do |language|
        value = language if value.nil?
        value = language if language.downcase == 'english'
      end
    end
    info_hash[key.downcase.gsub(/\s/, '_')] = value
  end

  unless found_info_divs
    # If we don't find any info divs assume parsing failed
    raise "No info divs found for imdb id #{imdb_id}"
  end

  # Scrape poster image urls. xpath returns a node set even when the page has
  # no poster, so the first node is what has to be checked.
  thumb = doc.xpath("//div[@class = 'photo']/a/img").first
  if thumb
    thumbnail_url = thumb['src']
    unless thumbnail_url =~ /addposter.jpg$/
      info_hash['small_image'] = thumbnail_url

      # Try to scrape a larger version of the image url
      large_img_page = doc.xpath("//div[@class = 'photo']/a").first['href']
      large_img_doc = Nokogiri::HTML(open('http://www.imdb.com' + large_img_page))
      primary_img = large_img_doc.xpath("//img[@id = 'primary-img']").first
      info_hash['large_image'] = primary_img ? primary_img['src'] : nil
    end
  end

  # Scrape episodes if tv series
  if info_hash.has_key?('seasons')
    episodes = []
    doc = self.get_episodes_page(imdb_id)
    doc.css(".filter-all").each do |e_div|
      if e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
        # $' is the text after the match, i.e. the episode title
        episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
        if e_div.xpath(".//td").inner_text =~ /(\d+ (January|February|March|April|May|June|July|August|September|October|November|December) \d{4})/
          episode['date'] = Date.parse($1)
          # Whatever follows the air date is taken as the plot
          episode['plot'] = $'.strip
        end
        episodes << episode
      end
    end
    info_hash['episodes'] = episodes
  end

  info_hash
end
|
153
|
+
|
154
|
+
private
|
155
|
+
# Fetches and parses the IMDB search results page for +name+.
#
# The name is encoded as a form component so that characters with a meaning
# inside a query string (&, =, +, #) are searched for literally instead of
# cutting the query short.
#
# name - search term
#
# Returns the document produced by Nokogiri::HTML.
def self.get_search_page(name)
  Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.encode_www_form_component(name)))
end
|
158
|
+
|
159
|
+
# Fetches and parses the IMDB title page for +imdb_id+.
#
# imdb_id - String appended to IMDB_MOVIE_URL
#
# Returns the document produced by Nokogiri::HTML.
def self.get_movie_page(imdb_id)
  movie_url = IMDB_MOVIE_URL + imdb_id
  Nokogiri::HTML(open(movie_url))
end
|
162
|
+
|
163
|
+
# Fetches and parses the IMDB episodes page for +imdb_id+.
#
# imdb_id - String appended to IMDB_MOVIE_URL, followed by "/episodes"
#
# Returns the document produced by Nokogiri::HTML.
def self.get_episodes_page(imdb_id)
  episodes_url = IMDB_MOVIE_URL + imdb_id + '/episodes'
  Nokogiri::HTML(open(episodes_url))
end
|
166
|
+
|
167
|
+
# Reads the title and year from the page's title meta element.
#
# The meta content is expected to look like 'Movie Name (2010)' or
# 'Movie Name (2010/I)'. The title part is passed through clean_title.
#
# doc - parsed page that responds to xpath
#
# Returns [title, year]; both are nil when the meta element is missing or its
# content does not have the expected form.
def self.get_title_and_year_from_meta(doc)
  title_meta = doc.xpath("//meta[@name='title']").first
  return [nil, nil] unless title_meta

  parsed = /(.*) \((\d{4})\/?\w*\)/.match(title_meta['content'])
  return [nil, nil] unless parsed

  [clean_title(parsed[1]), parsed[2].to_i]
end
|
180
|
+
|
181
|
+
# Strips the surrounding double quotes that seem to appear around tv show
# names, then trims leading and trailing whitespace.
#
# movie_title - raw title text
#
# Returns the cleaned title String.
def self.clean_title(movie_title)
  quoted = /^"(.*)"$/.match(movie_title)
  unquoted = quoted ? quoted[1] : movie_title
  unquoted.strip
end
|
186
|
+
|
187
|
+
# Determines the video type of a search result from its table cell text.
#
# td - search result cell that responds to content
#
# Returns :tv_show when the text contains "(TV series)" or "(TV)",
# otherwise :movie.
def self.video_type(td)
  td.content =~ /\((TV series|TV)\)/ ? :tv_show : :movie
end
|
191
|
+
|
192
|
+
# Determines the video type of a title page from its type meta element,
# i.e. the first meta element whose property attribute contains 'type'.
#
# doc - parsed page that responds to xpath
#
# Returns :tv_show when the element's content is 'tv_show'; :movie for any
# other content or when the element is missing.
def self.video_type_from_meta(doc)
  type_tag = doc.xpath("//meta[contains(@property,'type')]").first
  return :movie unless type_tag

  type_tag['content'] == 'tv_show' ? :tv_show : :movie
end
|
201
|
+
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: yayimdbs
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Sam Cavenagh
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-07-07 00:00:00 +10:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: nokogiri
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: active_support
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: tzinfo
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id003
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: i18n
|
61
|
+
prerelease: false
|
62
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
type: :runtime
|
71
|
+
version_requirements: *id004
|
72
|
+
- !ruby/object:Gem::Dependency
|
73
|
+
name: rspec
|
74
|
+
prerelease: false
|
75
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
segments:
|
81
|
+
- 0
|
82
|
+
version: "0"
|
83
|
+
type: :development
|
84
|
+
version_requirements: *id005
|
85
|
+
description:
|
86
|
+
email: cavenaghweb@hotmail.com
|
87
|
+
executables: []
|
88
|
+
|
89
|
+
extensions: []
|
90
|
+
|
91
|
+
extra_rdoc_files:
|
92
|
+
- README.md
|
93
|
+
files:
|
94
|
+
- README.md
|
95
|
+
- lib/yay_imdbs.rb
|
96
|
+
has_rdoc: true
|
97
|
+
homepage: http://github.com/o-sam-o/yayimdbs
|
98
|
+
licenses: []
|
99
|
+
|
100
|
+
post_install_message:
|
101
|
+
rdoc_options:
|
102
|
+
- --main
|
103
|
+
- README.md
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project:
|
125
|
+
rubygems_version: 1.3.7
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: Yet Another Ying IMDB Scraper
|
129
|
+
test_files: []
|
130
|
+
|