royw-imdb 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,77 @@
1
+
2
+ ImdbMovie Indiana Jones and the Last Crusade
3
+ - should query IMDB url
4
+ - should get the title
5
+ - should get director(s)
6
+ - should get the poster url
7
+ - should return an ImdbImage object
8
+ - should get the rating
9
+ - should get cast members
10
+ - should get the writers
11
+ - should get the year
12
+ - should get the release date
13
+ - should get the genres
14
+ - should get the plot
15
+ - should get the length
16
+ - should get the countries
17
+ - should get the languages
18
+ - should get the color
19
+ - should get the company
20
+ - should get some photos
21
+ - should get the tagline
22
+ - should get the aspect ratio
23
+
24
+ ImdbMovie Han robado una estrella
25
+ - should query IMDB url
26
+ - should get the title
27
+ - should get director(s)
28
+ - should not get the poster
29
+ - should get cast members
30
+ - should get the writers
31
+ - should get the release date
32
+ - should get the genres
33
+ - should not get the plot
34
+ - should get the length
35
+ - should get the countries
36
+ - should get the languages
37
+ - should not get the color
38
+ - should get the company
39
+ - should not get any photos
40
+
41
+ ImdbSearch search that returns multiple movies
42
+ - should query IMDB url
43
+
44
+ ImdbSearch search that returns multiple movies movies
45
+ - should be a collection of ImdbMovie instances
46
+ - should include 'Indiana Jones and the Last Crusade'
47
+ - should not have titles with HTML tags
48
+ - should not have duplicate movies
49
+
50
+ ImdbSearch search that redirects to the lone matching movie movies
51
+ - should be a collection containing a single ImdbMovie instance
52
+ - should have the correct ID
53
+ - should have the correct title
54
+
55
+ ImdbSearch searches that match on AKA title movies
56
+ - should have multiple movies
57
+ - should find id tt0127357
58
+ - should have only one movie from 1998
59
+
60
+ ImdbSearch searches that match on AKA title but without search_aka enabled movies
61
+ - should have multiple movies
62
+ - should have 2 movies from 1998
63
+
64
+ ImdbMovie Indiana Jones and the Last Crusade
65
+ - should query IMDB url
66
+ - should get the image
67
+
68
+ String unescape_html
69
+ - should convert &amp; to &
70
+ - should convert &#243; to ó
71
+
72
+ String strip_tags
73
+ - should strip HTML tags
74
+
75
+ Finished in 5.222681 seconds
76
+
77
+ 53 examples, 0 failures
@@ -0,0 +1,14 @@
require 'fileutils'

# == Synopsis
# add a mkdirs method to the File class
class File
  ##
  # Make a directory, including any missing directories in its path.
  #
  # Nothing is done when +dirspec+ already exists (whether it is a directory
  # or a plain file), so the call is safe to repeat.
  #
  # @param [String] dirspec the path to make sure exists
  # @return [nil]
  def File.mkdirs(dirspec)
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    # FileUtils.mkdir_p creates every missing parent and tolerates another
    # process creating one of them in the meantime.
    FileUtils.mkdir_p(dirspec) unless File.exist?(dirspec)
    nil
  end
end
data/lib/imdb.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'open-uri'
2
+ require 'date'
3
+ require 'cgi'
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ require 'chronic'
7
+
8
+ require File.dirname(__FILE__) + '/imdb/imdb_search'
9
+ require File.dirname(__FILE__) + '/imdb/imdb_movie'
10
+ require File.dirname(__FILE__) + '/imdb/imdb_image'
11
+ require File.dirname(__FILE__) + '/string_extensions'
12
+ require File.dirname(__FILE__) + '/file_extensions'
13
+ require File.dirname(__FILE__) + '/object_extensions'
@@ -0,0 +1,19 @@
# An IMDB image page, from which the address of the displayed picture can be
# extracted.
class ImdbImage

  # Absolute address of the image page.
  attr_accessor :url

  # @param [String] url path of the image page; it is joined onto
  #   "http://www.imdb.com/" to form the absolute address
  def initialize(url)
    @url = File.join("http://www.imdb.com/", url)
  end

  # The 'src' attribute of the first img found under "table#principal tr td".
  #
  # @return [String, nil] the picture address, or nil when the page could not
  #   be read or does not contain the expected markup
  def image
    picture = document.at("table#principal tr td img")
    picture['src']
  rescue
    nil
  end

  # The parsed page; it is downloaded and parsed on first use, then reused.
  def document
    @document ||= Hpricot(open(self.url).read)
  end

end
@@ -0,0 +1,224 @@
require 'timeout'

# A lazily scraped view of one IMDB title page.
#
# The page at #url is downloaded the first time an attribute needs it and the
# parsed document is then reused by every later accessor. Most accessors are
# best-effort: when the expected markup is missing they return nil (or an
# empty Array) instead of raising.
class ImdbMovie
  include Comparable

  # Total number of download attempts #document makes before re-raising.
  MAX_ATTEMPTS = 3
  # Seconds to wait between two download attempts.
  SECONDS_BETWEEN_RETRIES = 1.0

  attr_reader :id, :url

  # @param [#to_s] id the IMDB id without the leading 'tt'
  # @param [String, nil] title an already known title; when nil the title is
  #   scraped from the page on first use
  def initialize(id, title = nil)
    @id = id
    @url = sprintf(ImdbMovie::url_format, @id.to_s)
    @title = title
  end

  # This is intended to be stubbed by rspec, where it should return the path
  # to the cached html file.
  # Note, the returned String should have one '%s', which sprintf replaces
  # with @id.to_s.
  def self.url_format
    'http://www.imdb.com/title/tt%s/'
  end

  # This is intended to be stubbed by rspec, where it should return true.
  def self.use_html_cache
    false
  end

  # Movies are ordered, and considered equal, by their id.
  #
  # @return [Integer, nil] nil when +other+ is not an ImdbMovie
  def <=>(other)
    return nil unless other.is_a?(ImdbMovie)
    @id <=> other.id
  end

  # Array#uniq and Hash lookups rely on #eql? and #hash rather than #<=>, so
  # both are defined in terms of the id to agree with #==.
  def eql?(other)
    other.is_a?(ImdbMovie) && @id == other.id
  end

  # @return [Integer] hash of the id, consistent with #eql?
  def hash
    @id.hash
  end

  # @return [String, nil] the title given to the constructor, otherwise the
  #   title scraped from the page (nil when it cannot be found)
  def title
    if @title.nil?
      @title = document.at("div#tn15title h1").innerHTML.split('<span>').first.strip.unescape_html rescue nil
    end
    @title
  end

  # @return [Array<String>] unique link texts following the 'Director' heading
  def directors
    credited_names('Director')
  end

  # @return [String, nil] the href of the anchor named 'poster'
  def poster_url
    document.at("a[@name='poster']")['href'] rescue nil
  end

  # @return [String, nil] the src of the img inside the anchor named 'poster'
  def tiny_poster_url
    document.at("a[@name='poster'] img")['src'] rescue nil
  end

  # @return [ImdbImage, nil] image page wrapper built from #poster_url
  def poster
    ImdbImage.new(poster_url) rescue nil
  end

  # @return [Float, nil] the number in front of the '/' in the user rating
  def rating
    document.at("h5[text()='User Rating:'] ~ b").innerHTML.strip.unescape_html.split('/').first.to_f rescue nil
  end

  # @return [Array<Array(String, String)>] one [name, character] pair per row
  #   of the cast table; the character falls back to the plain cell content
  #   when the cell holds no link
  def cast_members
    document.search("table.cast tr").map do |row|
      actor = row.search("td.nm a").innerHTML.strip.unescape_html
      character = row.search("td.char a").innerHTML.strip.unescape_html
      character = row.search("td.char").innerHTML.strip.unescape_html if character.empty?
      [actor, character]
    end
  end

  # @return [Array<String>] unique link texts following the 'Writer' heading
  def writers
    credited_names('Writer')
  end

  # @return [String] inner HTML of the links into "/Sections/Years/"
  def year
    document.search('a[@href^="/Sections/Years/"]').innerHTML
  end

  # @return [Date, nil] the "day month year" found at the start of the
  #   'Release Date' section, or nil when it is absent or unparsable
  def release_date
    date = document.search("//h5[text()^='Release Date']/..").innerHTML[/^\d{1,2} \w+ \d{4}/]
    Date.parse(Chronic.parse(date).strftime('%Y/%m/%d'))
  rescue
    nil
  end

  # @return [Array<String>] link texts of the genre links
  def genres
    document.search("h5[text()='Genre:'] ~ a[@href*=/Sections/Genres/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
  end

  # @return [String, nil] text of the 'Plot' section
  def plot
    info_text('Plot')
  end

  # @return [String, nil] text of the 'Tagline' section
  def tagline
    info_text('Tagline')
  end

  # @return [String, nil] text of the 'Aspect Ratio' section
  def aspect_ratio
    info_text('Aspect Ratio')
  end

  # @return [String, nil] the first "<digits> min" in the 'Runtime' section
  def length
    document.search("//h5[text()^='Runtime']/..").innerHTML[/\d+ min/] rescue nil
  end

  # @return [Array<String>] link texts of the country links
  def countries
    document.search("h5[text()='Country:'] ~ a[@href*=/Sections/Countries/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
  end

  # @return [Array<String>] link texts of the language links
  def languages
    document.search("h5[text()='Language:'] ~ a[@href*=/Sections/Languages/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
  end

  # @return [String, nil] link text of the color-info link
  def color
    document.at("h5[text()='Color:'] ~ a[@href*=color-info']").innerHTML.strip.unescape_html rescue nil
  end

  # @return [String, nil] link text of the company link
  def company
    document.at("h5[text()='Company:'] ~ a[@href*=/company/']").innerHTML.strip.unescape_html rescue nil
  end

  # @return [Array<String>] src of every image in the media strip
  def photos
    document.search(".media_strip_thumb img").map { |img| img['src'] } rescue []
  end

  # return the raw title
  def raw_title
    document.at("h1").innerText
  end

  # is this a video game as indicated by a '(VG)' in the raw title?
  #
  # @return [Integer, nil] match position (truthy) or nil
  def video_game?
    raw_title =~ /\(VG\)/
  end

  # find the release year
  # Note, this is needed because not all entries on IMDB have a full
  # release date as parsed by release_date.
  #
  # @return [String, nil] the first four-digit run in the 'Release Date' section
  def release_year
    document.search("//h5[text()^='Release Date']/..").innerHTML[/\d{4}/]
  end

  # return an Array of Strings containing AKA titles
  #
  # Collects the non-element nodes that follow the 'Also Known As:' heading,
  # removes any parenthesised parts and drops duplicates and blank entries.
  def also_known_as
    node = document.search("//h5[text()^='Also Known As:']/..").at('h5')
    raw = []
    until node.nil?
      raw << node.to_s unless node.elem?
      node = node.next
    end
    raw.map { |text| text.gsub(/\([^\)]*\)/, '').strip }.uniq.reject { |text| text.empty? }
  end

  # The MPAA rating, i.e. "PG-13"
  def mpaa
    document.search("//h5[text()^='MPAA']/..").text.gsub('MPAA:', '').strip rescue nil
  end

  # older films may not have MPAA ratings but usually have a certification.
  # return a hash with country abbreviations for keys and the certification string for the value
  # example: {'USA' => 'Approved'}
  def certifications
    cert_hash = {}
    certs = document.search("h5[text()='Certification:'] ~ a[@href*=/List?certificates']").map { |link| link.innerHTML.strip } rescue []
    certs.each { |line| cert_hash[$1] = $2 if line =~ /(.*):(.*)/ }
    cert_hash
  end

  private

  # Unique, unescaped link texts that follow the h5 heading starting with
  # +heading+; the 'more' link is left out. Returns [] on any error.
  def credited_names(heading)
    document.search("h5[text()^='#{heading}'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
  end

  # Third line of the section whose h5 heading starts with +heading+, with
  # embedded tags removed and entities unescaped. Returns nil on any error.
  def info_text(heading)
    document.search("//h5[text()^='#{heading}']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
  end

  # Fetch the document with retry to handle the occasional glitches.
  #
  # The page is downloaded and parsed only once per instance; later calls
  # return the memoized document without touching the network. At most
  # MAX_ATTEMPTS downloads are tried, after which the last error is re-raised.
  def document
    return @document if @document

    attempts = 0
    begin
      html = open(self.url).read
      @document = Hpricot(html)
      cache_html_files(html) if ImdbMovie::use_html_cache
    rescue StandardError, Timeout::Error
      # Timeout::Error is listed explicitly because on Ruby 1.8 it does not
      # descend from StandardError. Interrupt and SystemExit are deliberately
      # not rescued so the process can still be stopped.
      attempts += 1
      raise if attempts >= MAX_ATTEMPTS
      sleep SECONDS_BETWEEN_RETRIES
      retry
    end
    @document
  end

  # this is used to save imdb pages so they may be used by rspec
  #
  # Failures are reported on stdout and otherwise ignored so that caching can
  # never break a lookup.
  def cache_html_files(html)
    filespec = self.url.gsub(/^http:\//, 'spec/samples').gsub(/\/$/, '.html')
    unless File.exist?(filespec)
      puts filespec
      File.mkdirs(File.dirname(filespec))
      File.open(filespec, 'w') { |f| f.puts html }
    end
  rescue StandardError => e
    puts e.to_s
  end

end
@@ -0,0 +1,111 @@
# Runs a title search against IMDB and exposes the matching movies.
class ImdbSearch

  attr_reader :query

  # @param [String] query the title to search for
  # @param [Boolean] search_akas when true, a multi-movie result list is
  #   narrowed to the movies whose title or AKA titles match the query
  def initialize(query, search_akas=false)
    @query = query
    @search_akas = search_akas
  end

  # @return [Array<ImdbMovie>] the movies found; parsed once and then reused
  def movies
    @movies ||= parse_movies_from_document
  end

  # Find the IMDB ID for the current search title
  # The find can be helped a lot by including a years option that contains
  # an Array of integers that are the production year (plus/minus a year)
  # and the release year.
  #
  # @param [Hash] options optional :years => Array of Integer
  # @return [String, nil] "tt<id>" when exactly one distinct id matches,
  #   otherwise nil
  def find_id(options={})
    found_movies = self.movies
    return nil if found_movies.nil?

    years = options[:years]
    desired_movies = found_movies.select do |m|
      aka = m.also_known_as
      matches = imdb_compare_titles(m.title, aka, @query) && !m.video_game? && !m.release_year.nil?
      matches &&= years.include?(m.release_year.to_i) unless years.nil?
      matches
    end

    ids = desired_movies.map { |m| m.id }.uniq.compact
    ids.length == 1 ? "tt#{ids.first}" : nil
  end

  protected

  # compare the imdb title and the imdb title's AKAs against the media title.
  # note, on exact match lookups, IMDB will sometimes set the title to
  # 'trailers and videos' instead of the correct title.
  #
  # @return [Boolean]
  def imdb_compare_titles(imdb_title, aka_titles, media_title)
    return true if fuzzy_compare_titles(imdb_title, media_title)
    return true if fuzzy_compare_titles(imdb_title, 'trailers and videos')
    (aka_titles || []).any? { |aka| fuzzy_compare_titles(aka, media_title) }
  end

  # a fuzzy compare that is case insensitive and replaces '&' with 'and'
  # (because that is what IMDB occasionally does)
  #
  # A nil title never matches; scraped titles are nil when the page did not
  # contain the expected markup.
  #
  # @return [Boolean]
  def fuzzy_compare_titles(title1, title2)
    return false if title1.nil? || title2.nil?

    t1 = title1.downcase
    t2 = title2.downcase
    (t1 == t2) ||
      (t1.gsub(/&/, 'and') == t2.gsub(/&/, 'and')) ||
      (t1.gsub(/[-:]/, ' ') == t2.gsub(/[-:]/, ' ')) ||
      (t1.gsub('more at imdbpro ?', '') == t2)
  end

  private

  # The parsed search result page; downloaded and parsed on first use.
  def document
    filespec = "http://www.imdb.com/find?q=#{CGI::escape(@query)};s=tt"
    @document ||= Hpricot(open(filespec).read)
  end

  def parse_movies_from_document
    exact_match? ? parse_exact_match_search_results : parse_multi_movie_search_results
  end

  # IMDB redirected straight to a title page: build the single movie from the
  # id at the end of the poster link and the page heading.
  def parse_exact_match_search_results
    id = document.at("a[@name='poster']")['href'][/\d+$/]
    title = document.at("h1").innerHTML.split('<span').first.strip.unescape_html rescue nil
    [ImdbMovie.new(id, title)]
  end

  # Build one movie per distinct [id, title] pair among the title links that
  # have visible text, then optionally keep only those matching the query.
  def parse_multi_movie_search_results
    links = document.search('a[@href^="/title/tt"]').reject do |element|
      element.innerHTML.strip_tags.empty?
    end
    ids_and_titles = links.map do |element|
      [element['href'][/\d+/], element.innerHTML.strip_tags.unescape_html]
    end.uniq

    films = ids_and_titles.map { |id, title| ImdbMovie.new(id, title) }.uniq

    if films.length > 1 && @search_akas
      films = films.select do |m|
        aka = m.also_known_as
        imdb_compare_titles(m.title, aka, @query) && !m.video_game?
      end
    end
    films
  end

  # True when the page is not the 'IMDb Title Search' result list and
  # contains a poster anchor.
  def exact_match?
    document.search("title[text()='IMDb Title Search']").empty? && !document.search("a[@name='poster']").empty?
  end

end
@@ -0,0 +1,19 @@
# == Synopsis
# add a blank? method to all Objects
class Object
  # Is this object nil or empty?
  #
  # Objects that also respond to 'strip' are tested after stripping; strip
  # returns a copy, so the receiver is not modified. Objects that do not
  # respond to 'empty?' are never blank.
  # TODO: maybe collections should also be compacted before the check
  #
  # @return [Boolean]
  def blank?
    return true if nil?
    return false unless respond_to? 'empty?'

    respond_to?('strip') ? strip.empty? : empty?
  end
end
@@ -0,0 +1,16 @@
require 'cgi'
# Iconv left the standard library in Ruby 2.0, so it is only loaded on
# interpreters whose Strings cannot transcode themselves.
require 'iconv' unless String.method_defined?(:encode)

# String helpers used when scraping IMDB pages; mixed into String below.
module ImdbStringExtensions

  # Decode HTML entities and return the result as UTF-8.
  #
  # On interpreters without String#encode the text is assumed to be
  # ISO-8859-1 and converted with Iconv. Otherwise valid UTF-8 text is
  # returned as is, binary or invalid UTF-8 text is treated as ISO-8859-1,
  # and any other encoding is transcoded from the encoding the string is
  # tagged with.
  #
  # @return [String] the unescaped text in UTF-8
  def unescape_html
    text = CGI::unescapeHTML(self)
    return Iconv.conv("UTF-8", 'ISO-8859-1', text) unless text.respond_to?(:encode)
    # Converting text that is already UTF-8 would double-encode it.
    return text if text.encoding == Encoding::UTF_8 && text.valid_encoding?

    if text.encoding == Encoding::UTF_8 || text.encoding == Encoding::ASCII_8BIT
      text = text.dup.force_encoding('ISO-8859-1')
    end
    text.encode('UTF-8')
  end

  # Remove anything that looks like an opening or closing tag.
  #
  # @return [String] the text without "<...>" sequences
  def strip_tags
    gsub(/<\/?[^>]*>/, "")
  end

end

String.send :include, ImdbStringExtensions
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: royw-imdb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.8
5
+ platform: ruby
6
+ authors:
7
+ - Sergio Gil
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-03-21 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.6"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: chronic
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description:
36
+ email: sgilperez@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/imdb/imdb_movie.rb
45
+ - lib/imdb/imdb_search.rb
46
+ - lib/imdb/imdb_image.rb
47
+ - lib/imdb.rb
48
+ - lib/file_extensions.rb
49
+ - lib/object_extensions.rb
50
+ - lib/string_extensions.rb
51
+ - README
52
+ has_rdoc: false
53
+ homepage:
54
+ post_install_message:
55
+ rdoc_options: []
56
+
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: "0"
70
+ version:
71
+ requirements: []
72
+
73
+ rubyforge_project:
74
+ rubygems_version: 1.2.0
75
+ signing_key:
76
+ specification_version: 2
77
+ summary: Internet Movie DataBase
78
+ test_files: []
79
+