yayimdbs 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/README.md +31 -0
  2. data/lib/yay_imdbs.rb +201 -0
  3. metadata +130 -0
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Yay IMDBs !
2
+
3
+ Overview
4
+ --------
5
+ Yet Another Ying IMDB Scraper
6
+
7
+ This is a simple IMDb scraper that I created as part of my [onbox](http://github.com/o-sam-o/onbox) project. I have moved it out into its own gem so I can share it across projects.
8
+
9
+ Features
10
+ --------
11
+ * Basic search functionality, which can be limited based on: title, year and type (tv show or movie)
12
+ * Built on Nokogiri rather than Hpricot (I was having encoding issues with Hpricot on Ruby 1.9.2)
13
+ * Support for scraping most info from a movie page
14
+ * Support for getting a thumbnail and a large image url
15
+ * Support for scraping tv show episodes
16
+
17
+ Installation
18
+ ------------
19
+ TODO
20
+
21
+ Examples
22
+ --------
23
+ TODO
24
+
25
+ Licence
26
+ -------
27
+ MIT
28
+
29
+ Contact
30
+ -------
31
+ Sam Cavenagh [(cavenaghweb@hotmail.com)](mailto:cavenaghweb@hotmail.com)
data/lib/yay_imdbs.rb ADDED
@@ -0,0 +1,201 @@
1
+ # encoding: utf-8
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'active_support/all'
5
+
6
+ class YayImdbs
7
+ IMDB_BASE_URL = 'http://www.imdb.com/'
8
+ IMDB_SEARCH_URL = IMDB_BASE_URL + 'find?s=tt&q='
9
+ IMDB_MOVIE_URL = IMDB_BASE_URL + 'title/tt'
10
+
11
+ STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/
12
+
13
+ def self.search_for_imdb_id(name, year, tv_series=false)
14
+ search_results = self.search_imdb(name)
15
+ return nil if search_results.empty?
16
+
17
+ search_results.each do |result|
18
+ # Ensure result is the correct video type
19
+ next if (result[:video_type] == :tv_show) != tv_series
20
+
21
+ # If no year provided just return first result
22
+ return result[:imdb_id] if !year || result[:year] == year
23
+ end
24
+ return nil
25
+ end
26
+
27
+ def self.search_imdb(search_term)
28
+ search_results = []
29
+
30
+ doc = self.get_search_page(search_term)
31
+ # If the search is an exact match imdb will redirect to the movie page not search results page
32
+ # we uses the the title meta element to determine if we got an exact match
33
+ movie_title, movie_year = get_title_and_year_from_meta(doc)
34
+ if movie_title
35
+ canonical_link = doc.xpath("//link[@rel='canonical']")
36
+ if canonical_link && canonical_link.first['href'] =~ /tt(\d+)\//
37
+ return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => self.video_type_from_meta(doc)]
38
+ else
39
+ raise "Unable to extract imdb id from exact search result"
40
+ end
41
+ end
42
+
43
+ doc.xpath("//td").each do |td|
44
+ td.xpath(".//a").each do |link|
45
+ href = link['href']
46
+ current_name = link.content
47
+
48
+ # Ignore links with no text (e.g. image links)
49
+ next unless current_name.present?
50
+ current_name = self.clean_title(current_name)
51
+
52
+ if href =~ /^\/title\/tt(\d+)/
53
+ imdb_id = $1
54
+ current_year = $1.gsub(/\(\)/, '').to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
55
+ search_results << {:imdb_id => imdb_id, :name => current_name, :year => current_year, :video_type => self.video_type(td)}
56
+ end
57
+ end
58
+ end
59
+
60
+ return search_results
61
+ end
62
+
63
+ def self.scrap_movie_info(imdb_id)
64
+ info_hash = {}.with_indifferent_access
65
+
66
+ doc = self.get_movie_page(imdb_id)
67
+ info_hash['title'], info_hash['year'] = get_title_and_year_from_meta(doc)
68
+ if info_hash['title'].nil?
69
+ #If we cant get title and year something is wrong
70
+ raise "Unable to find title or year for imdb id #{imdb_id}"
71
+ end
72
+ info_hash['video_type'] = self.video_type_from_meta(doc)
73
+
74
+ found_info_divs = false
75
+ doc.xpath("//div[@class='info']").each do |div|
76
+ next if div.xpath(".//h5").empty?
77
+ found_info_divs = true
78
+ key = div.xpath(".//h5").first.inner_text.sub(':', '').downcase
79
+ value_search = ".//div[@class = 'info-content']"
80
+ # Try to only get text values and ignore links as some info blocks have a "click for more info" type link at the end
81
+ value = div.xpath(value_search).first.children.map{|e| e.text? ? e.to_s : ''}.join.gsub(STRIP_WHITESPACE, '').strip
82
+ if value.empty?
83
+ value = div.xpath(value_search).first.content.gsub(STRIP_WHITESPACE, '')
84
+ end
85
+ if key == 'release date'
86
+ begin
87
+ value = Date.strptime(value, '%d %B %Y')
88
+ rescue
89
+ p "Invalid date '#{value}' for imdb id: #{imdb_id}"
90
+ value = nil
91
+ end
92
+ elsif key == 'runtime'
93
+ if value =~ /(\d+)\smin/
94
+ value = $1.to_i
95
+ else
96
+ p "Unexpected runtime format #{value} for movie #{imdb_id}"
97
+ end
98
+ elsif key == 'genre'
99
+ value = value.sub(/(See more$)|(more$)/, '').strip.split
100
+ elsif key == 'language'
101
+ # This is a bit of a hack, I dont really want to deal with multiple langauges, so if there is more than one
102
+ # just use english or the first one found
103
+ value = nil
104
+ div.xpath(value_search).first.inner_text.split(/\|/).collect {|l| l.strip}.each do |language|
105
+ value = language if value.nil?
106
+ value = language if language.downcase == 'english'
107
+ end
108
+ end
109
+ info_hash[key.downcase.gsub(/\s/, '_')] = value
110
+ end
111
+
112
+ if not found_info_divs
113
+ #If we don't find any info divs assume parsing failed
114
+ raise "No info divs found for imdb id #{imdb_id}"
115
+ end
116
+
117
+
118
+ #scrap poster image urls
119
+ thumb = doc.xpath("//div[@class = 'photo']/a/img")
120
+ if thumb
121
+ thumbnail_url = thumb.first['src']
122
+ if not thumbnail_url =~ /addposter.jpg$/
123
+ info_hash['small_image'] = thumbnail_url
124
+
125
+ #Try to scrap a larger version of the image url
126
+ large_img_page = doc.xpath("//div[@class = 'photo']/a").first['href']
127
+ large_img_doc = Nokogiri::HTML(open('http://www.imdb.com' + large_img_page))
128
+ large_img_url = large_img_doc.xpath("//img[@id = 'primary-img']").first['src'] unless large_img_doc.xpath("//img[@id = 'primary-img']").empty?
129
+ info_hash['large_image'] = large_img_url
130
+ end
131
+ end
132
+
133
+ #scrap episodes if tv series
134
+ if info_hash.has_key?('seasons')
135
+ episodes = []
136
+ doc = self.get_episodes_page(imdb_id)
137
+ episode_divs = doc.css(".filter-all")
138
+ episode_divs.each do |e_div|
139
+ if e_div.xpath('.//h3').inner_text =~ /Season (\d+), Episode (\d+):/
140
+ episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}
141
+ if e_div.xpath(".//td").inner_text =~ /(\d+ (January|February|March|April|May|June|July|August|September|October|November|December) \d{4})/
142
+ episode['date'] = Date.parse($1)
143
+ episode['plot'] = $'.strip
144
+ end
145
+ episodes << episode
146
+ end
147
+ end
148
+ info_hash['episodes'] = episodes
149
+ end
150
+
151
+ return info_hash
152
+ end
153
+
154
+ private
155
+ def self.get_search_page(name)
156
+ return Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.escape(name)))
157
+ end
158
+
159
+ def self.get_movie_page(imdb_id)
160
+ return Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
161
+ end
162
+
163
+ def self.get_episodes_page(imdb_id)
164
+ return Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
165
+ end
166
+
167
+ def self.get_title_and_year_from_meta(doc)
168
+ return nil, nil unless doc.xpath("//meta[@name='title']").first
169
+
170
+ title_text = doc.xpath("//meta[@name='title']").first['content']
171
+ # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)'
172
+ if title_text =~ /(.*) \((\d{4})\/?\w*\)/
173
+ movie_title = $1
174
+ movie_year = $2.to_i
175
+
176
+ movie_title = self.clean_title(movie_title)
177
+ end
178
+ return movie_title, movie_year
179
+ end
180
+
181
+ # Remove surrounding double quotes that seems to appear on tv show name
182
+ def self.clean_title(movie_title)
183
+ movie_title = $1 if movie_title =~ /^"(.*)"$/
184
+ return movie_title.strip
185
+ end
186
+
187
+ def self.video_type(td)
188
+ return :tv_show if td.content =~ /\((TV series|TV)\)/
189
+ return :movie
190
+ end
191
+
192
+ def self.video_type_from_meta(doc)
193
+ meta_type_tag = doc.xpath("//meta[contains(@property,'type')]")
194
+ return :movie unless meta_type_tag.first
195
+ type_text = meta_type_tag.first['content']
196
+ case type_text
197
+ when 'tv_show' then return :tv_show
198
+ else return :movie
199
+ end
200
+ end
201
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: yayimdbs
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 1
9
+ version: 0.1.1
10
+ platform: ruby
11
+ authors:
12
+ - Sam Cavenagh
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-07 00:00:00 +10:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: nokogiri
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: active_support
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ version: "0"
44
+ type: :runtime
45
+ version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: tzinfo
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ type: :runtime
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: i18n
61
+ prerelease: false
62
+ requirement: &id004 !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ type: :runtime
71
+ version_requirements: *id004
72
+ - !ruby/object:Gem::Dependency
73
+ name: rspec
74
+ prerelease: false
75
+ requirement: &id005 !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ type: :development
84
+ version_requirements: *id005
85
+ description:
86
+ email: cavenaghweb@hotmail.com
87
+ executables: []
88
+
89
+ extensions: []
90
+
91
+ extra_rdoc_files:
92
+ - README.md
93
+ files:
94
+ - README.md
95
+ - lib/yay_imdbs.rb
96
+ has_rdoc: true
97
+ homepage: http://github.com/o-sam-o/yayimdbs
98
+ licenses: []
99
+
100
+ post_install_message:
101
+ rdoc_options:
102
+ - --main
103
+ - README.md
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ required_rubygems_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ requirements: []
123
+
124
+ rubyforge_project:
125
+ rubygems_version: 1.3.7
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: Yet Another Ying IMDB Scraper
129
+ test_files: []
130
+