yayimdbs 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/README.md +31 -0
  2. data/lib/yay_imdbs.rb +201 -0
  3. metadata +130 -0
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Yay IMDBs !
2
+
3
+ Overview
4
+ --------
5
+ Yet Another Ying IMDB Scraper
6
+
7
+ This is a simple IMDb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
8
+
9
+ Features
10
+ --------
11
+ * Basic search functionality, which can be limited based on: title, year and type (tv show or movie)
12
+ * Built on Nokogiri rather than Hpricot (I was having encoding issues with Hpricot on Ruby 1.9.2)
13
+ * Support for scraping most info from a movie page
14
+ * Support for getting a thumbnail and a large image url
15
+ * Support for scraping tv show episodes
16
+
17
+ Installation
18
+ ------------
19
+ TODO
20
+
21
+ Examples
22
+ --------
23
+ TODO
24
+
25
+ Licence
26
+ -------
27
+ MIT
28
+
29
+ Contact
30
+ -------
31
+ Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
data/lib/yay_imdbs.rb ADDED
@@ -0,0 +1,201 @@
1
+ # encoding: utf-8
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'active_support/all'
5
+
6
# Scraper for IMDb search result pages and title pages, built on Nokogiri.
#
# Everything is exposed as class methods. The two entry points are
# search_imdb / search_for_imdb_id (look a title up) and scrap_movie_info
# (collect the details of one title). Pages are fetched over HTTP with
# open-uri, so every entry point performs network requests.
class YayImdbs
  IMDB_BASE_URL = 'http://www.imdb.com/'.freeze
  IMDB_SEARCH_URL = (IMDB_BASE_URL + 'find?s=tt&q=').freeze
  IMDB_MOVIE_URL = (IMDB_BASE_URL + 'title/tt').freeze

  # Noise stripped from scraped info values: runs of whitespace, newlines,
  # pipe separators and the "non-breaking space + »" pair.
  STRIP_WHITESPACE = /(\s{2,}|\n|\||\u00A0\u00BB)/

  # Returns the imdb id (digits only, without the "tt" prefix) of the first
  # search result for +name+ that has the requested video type and year.
  #
  # name      - title to search for
  # year      - release year the result must have; nil/false accepts any year
  # tv_series - true to accept only tv shows, false to accept only movies
  #
  # Returns nil when nothing matches.
  def self.search_for_imdb_id(name, year, tv_series=false)
    match = search_imdb(name).find do |result|
      # Ensure result is the correct video type
      next false if (result[:video_type] == :tv_show) != tv_series

      # If no year provided the first result of the right type wins
      !year || result[:year] == year
    end
    match && match[:imdb_id]
  end

  # Searches imdb for +search_term+.
  #
  # Returns an array of hashes with the keys :imdb_id, :name, :year and
  # :video_type (:movie or :tv_show). :year is nil when the result row shows
  # no year.
  #
  # Raises RuntimeError when the page looks like a title page (exact match)
  # but no imdb id can be read from its canonical link.
  def self.search_imdb(search_term)
    doc = get_search_page(search_term)

    # If the search is an exact match imdb redirects to the movie page instead
    # of a results page; the title meta element tells the two apart.
    movie_title, movie_year = get_title_and_year_from_meta(doc)
    if movie_title
      # xpath returns a (possibly empty) node set, so test the node itself
      canonical_link = doc.xpath("//link[@rel='canonical']").first
      unless canonical_link && canonical_link['href'] =~ /tt(\d+)\//
        raise "Unable to extract imdb id from exact search result"
      end
      imdb_id = Regexp.last_match(1)
      return [{:name => movie_title, :year => movie_year, :imdb_id => imdb_id,
               :video_type => video_type_from_meta(doc)}]
    end

    search_results = []
    doc.xpath("//td").each do |td|
      td.xpath(".//a").each do |link|
        current_name = link.content

        # Ignore links with no text (e.g. image links)
        next unless current_name.present?
        # Only links to title pages are search results
        next unless link['href'] =~ /^\/title\/tt(\d+)/
        imdb_id = Regexp.last_match(1)

        # The year is shown in the surrounding cell as "(2010)" or "(2010/I)"
        current_year = Regexp.last_match(1).to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
        search_results << {:imdb_id => imdb_id, :name => clean_title(current_name),
                           :year => current_year, :video_type => video_type(td)}
      end
    end
    search_results
  end

  # Scrapes the title page of +imdb_id+ (digits only, without "tt").
  #
  # Returns a hash (with indifferent access) holding 'title', 'year' and
  # 'video_type', plus one entry per info block found on the page; the key is
  # the block heading, downcased with whitespace replaced by underscores.
  # 'small_image' / 'large_image' are added when the page has a poster, and
  # 'episodes' when the page has a "seasons" info block.
  #
  # Raises RuntimeError when the title cannot be read or the page contains no
  # info blocks at all.
  def self.scrap_movie_info(imdb_id)
    info_hash = {}.with_indifferent_access

    doc = get_movie_page(imdb_id)
    info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
    # If we cant get title and year something is wrong
    raise "Unable to find title or year for imdb id #{imdb_id}" if info_hash['title'].nil?
    info_hash['video_type'] = video_type_from_meta(doc)

    found_info_divs = false
    doc.xpath("//div[@class='info']").each do |div|
      heading = div.xpath(".//h5").first
      next unless heading
      found_info_divs = true

      # A heading without a content block carries nothing to record
      content_node = div.xpath(".//div[@class = 'info-content']").first
      next unless content_node

      key = heading.inner_text.sub(':', '').downcase
      info_hash[key.gsub(/\s/, '_')] = parse_info_value(key, content_node, imdb_id)
    end

    # If we don't find any info divs assume parsing failed
    raise "No info divs found for imdb id #{imdb_id}" unless found_info_divs

    scrap_image_urls(doc, info_hash)
    info_hash['episodes'] = scrap_episodes(imdb_id) if info_hash.key?('seasons')
    info_hash
  end

  # The methods below are helpers. A bare `private` has no effect on methods
  # defined with `def self.`, so the pre-existing helpers have always been
  # callable from outside; they are left public to stay backward compatible.

  # Fetches and parses the search results page for +name+.
  def self.get_search_page(name)
    fetch_document(search_url(name))
  end

  # Fetches and parses the title page for +imdb_id+.
  def self.get_movie_page(imdb_id)
    fetch_document(IMDB_MOVIE_URL + imdb_id)
  end

  # Fetches and parses the episode list page for +imdb_id+.
  def self.get_episodes_page(imdb_id)
    fetch_document(IMDB_MOVIE_URL + imdb_id + '/episodes')
  end

  # Reads the title and year from the page's <meta name="title"> element.
  #
  # Returns [title, year]; both are nil when the element is missing or its
  # content is not of the form 'Movie Name (2010)' / 'Movie Name (2010/I)'.
  def self.get_title_and_year_from_meta(doc)
    title_meta = doc.xpath("//meta[@name='title']").first
    return nil, nil unless title_meta
    return nil, nil unless title_meta['content'] =~ /(.*) \((\d{4})\/?\w*\)/

    raw_title = Regexp.last_match(1)
    movie_year = Regexp.last_match(2).to_i
    return clean_title(raw_title), movie_year
  end

  # Remove surrounding double quotes that seem to appear on tv show names,
  # then strip leading and trailing whitespace.
  def self.clean_title(movie_title)
    movie_title = Regexp.last_match(1) if movie_title =~ /^"(.*)"$/
    movie_title.strip
  end

  # Video type of a search result cell: :tv_show when its text contains
  # "(TV series)" or "(TV)", otherwise :movie.
  def self.video_type(td)
    return :tv_show if td.content =~ /\((TV series|TV)\)/
    :movie
  end

  # Video type of a title page, read from the first meta element whose
  # property attribute contains "type". Anything but 'tv_show' (including a
  # missing element) counts as :movie.
  def self.video_type_from_meta(doc)
    meta_type_tag = doc.xpath("//meta[contains(@property,'type')]").first
    return :tv_show if meta_type_tag && meta_type_tag['content'] == 'tv_show'
    :movie
  end

  # Builds the search URL for +name+. Form-encoding also escapes characters
  # such as '&' and '=' that would otherwise corrupt the query string.
  def self.search_url(name)
    IMDB_SEARCH_URL + URI.encode_www_form_component(name)
  end

  # Downloads +url+ and returns it as a parsed Nokogiri HTML document. The
  # block form closes the underlying IO as soon as parsing is done.
  def self.fetch_document(url)
    parse = lambda { |io| Nokogiri::HTML(io) }
    # Kernel#open no longer handles URLs from Ruby 3.0 on; URI.open is the
    # replacement where open-uri provides it.
    URI.respond_to?(:open) ? URI.open(url, &parse) : open(url, &parse)
  end

  # Converts the content of one info block into the value stored for +key+
  # (the downcased block heading).
  def self.parse_info_value(key, content_node, imdb_id)
    # Try to only get text values and ignore links, as some info blocks have
    # a "click for more info" type link at the end
    value = content_node.children.map { |e| e.text? ? e.to_s : '' }.join.gsub(STRIP_WHITESPACE, '').strip
    value = content_node.content.gsub(STRIP_WHITESPACE, '') if value.empty?

    case key
    when 'release date'
      begin
        Date.strptime(value, '%d %B %Y')
      rescue ArgumentError
        warn "Invalid date '#{value}' for imdb id: #{imdb_id}"
        nil
      end
    when 'runtime'
      if value =~ /(\d+)\smin/
        Regexp.last_match(1).to_i
      else
        # Keep the raw text so the caller still sees what the page said
        warn "Unexpected runtime format #{value} for movie #{imdb_id}"
        value
      end
    when 'genre'
      value.sub(/(See more$)|(more$)/, '').strip.split
    when 'language'
      # This is a bit of a hack: multiple languages are not supported, so if
      # there is more than one use english, or else the first one found
      languages = content_node.inner_text.split(/\|/).map(&:strip)
      languages.find { |language| language.downcase == 'english' } || languages.first
    else
      value
    end
  end

  # Adds 'small_image' and 'large_image' to +info_hash+ when the page has a
  # real poster. Fetches the poster's own page to find the large image.
  def self.scrap_image_urls(doc, info_hash)
    thumb = doc.xpath("//div[@class = 'photo']/a/img").first
    return unless thumb

    thumbnail_url = thumb['src']
    # Skip the "add a poster" placeholder image
    return if thumbnail_url =~ /addposter\.jpg$/
    info_hash['small_image'] = thumbnail_url

    # Try to scrap a larger version of the image url
    large_img_page = doc.xpath("//div[@class = 'photo']/a").first['href']
    large_img_doc = fetch_document(IMDB_BASE_URL.chomp('/') + large_img_page)
    large_img = large_img_doc.xpath("//img[@id = 'primary-img']").first
    info_hash['large_image'] = large_img && large_img['src']
  end

  # Scrapes the episode list of a tv series. Returns an array of hashes with
  # the keys "series", "episode" and "title", plus 'date' and 'plot' when an
  # air date is found in the episode's table cell.
  def self.scrap_episodes(imdb_id)
    episodes = []
    get_episodes_page(imdb_id).css(".filter-all").each do |e_div|
      next unless e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
      heading = Regexp.last_match
      # Whatever follows "Season x, Episode y:" is the episode title
      episode = {"series" => heading[1].to_i, "episode" => heading[2].to_i,
                 "title" => heading.post_match.strip}

      if e_div.xpath(".//td").inner_text =~ /(\d+ (January|February|March|April|May|June|July|August|September|October|November|December) \d{4})/
        aired = Regexp.last_match
        episode['date'] = Date.parse(aired[1])
        # The text after the air date is the plot summary
        episode['plot'] = aired.post_match.strip
      end
      episodes << episode
    end
    episodes
  end

  private_class_method :search_url, :fetch_document, :parse_info_value,
                       :scrap_image_urls, :scrap_episodes
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: yayimdbs
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 1
9
+ version: 0.1.1
10
+ platform: ruby
11
+ authors:
12
+ - Sam Cavenagh
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-07 00:00:00 +10:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: nokogiri
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: active_support
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ version: "0"
44
+ type: :runtime
45
+ version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: tzinfo
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ type: :runtime
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: i18n
61
+ prerelease: false
62
+ requirement: &id004 !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ type: :runtime
71
+ version_requirements: *id004
72
+ - !ruby/object:Gem::Dependency
73
+ name: rspec
74
+ prerelease: false
75
+ requirement: &id005 !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ type: :development
84
+ version_requirements: *id005
85
+ description:
86
+ email: cavenaghweb@hotmail.com
87
+ executables: []
88
+
89
+ extensions: []
90
+
91
+ extra_rdoc_files:
92
+ - README.md
93
+ files:
94
+ - README.md
95
+ - lib/yay_imdbs.rb
96
+ has_rdoc: true
97
+ homepage: http://github.com/o-sam-o/yayimdbs
98
+ licenses: []
99
+
100
+ post_install_message:
101
+ rdoc_options:
102
+ - --main
103
+ - README.md
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ required_rubygems_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ requirements: []
123
+
124
+ rubyforge_project:
125
+ rubygems_version: 1.3.7
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: Yet Another Ying IMDB Scraper
129
+ test_files: []
130
+