royw-imdb 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,77 @@
1
+
2
+ ImdbMovie Indiana Jones and the Last Crusade
3
+ - should query IMDB url
4
+ - should get the title
5
+ - should get director(s)
6
+ - should get the poster url
7
+ - should return an ImdbImage object
8
+ - should get the rating
9
+ - should get cast members
10
+ - should get the writers
11
+ - should get the year
12
+ - should get the release date
13
+ - should get the genres
14
+ - should get the plot
15
+ - should get the length
16
+ - should get the countries
17
+ - should get the languages
18
+ - should get the color
19
+ - should get the company
20
+ - should get some photos
21
+ - should get the tagline
22
+ - should get the aspect ratio
23
+
24
+ ImdbMovie Han robado una estrella
25
+ - should query IMDB url
26
+ - should get the title
27
+ - should get director(s)
28
+ - should not get the poster
29
+ - should get cast members
30
+ - should get the writers
31
+ - should get the release date
32
+ - should get the genres
33
+ - should not get the plot
34
+ - should get the length
35
+ - should get the countries
36
+ - should get the languages
37
+ - should not get the color
38
+ - should get the company
39
+ - should not get any photos
40
+
41
+ ImdbSearch search that returns multiple movies
42
+ - should query IMDB url
43
+
44
+ ImdbSearch search that returns multiple movies movies
45
+ - should be a collection of ImdbMovie instances
46
+ - should include 'Indiana Jones and the Last Crusade'
47
+ - should not have titles with HTML tags
48
+ - should not have duplicate movies
49
+
50
+ ImdbSearch search that redirects to the lone matching movie movies
51
+ - should be a collection containing a single ImdbMovie instance
52
+ - should have the correct ID
53
+ - should have the correct title
54
+
55
+ ImdbSearch searches that match on AKA title movies
56
+ - should have multiple movies
57
+ - should find id tt0127357
58
+ - should have only one movie from 1998
59
+
60
+ ImdbSearch searches that match on AKA title but without search_aka enabled movies
61
+ - should have multiple movies
62
+ - should have 2 movies from 1998
63
+
64
+ ImdbMovie Indiana Jones and the Last Crusade
65
+ - should query IMDB url
66
+ - should get the image
67
+
68
+ String unescape_html
69
+ - should convert &amp; to &
70
+ - should convert &#243; to ó
71
+
72
+ String strip_tags
73
+ - should strip HTML tags
74
+
75
+ Finished in 5.222681 seconds
76
+
77
+ 53 examples, 0 failures
@@ -0,0 +1,14 @@
1
+ # == Synopsis
2
+ # add a mkdirs method to the File class
3
class File
  ##
  # Make a directory together with any missing directories in its path.
  #
  # The method recurses up the path until it finds a component that already
  # exists, then creates each missing directory on the way back down.
  # Nothing is created when +dirspec+ already exists.
  #
  # @param [String] dirspec the path to make sure exists
  def File.mkdirs(dirspec)
    # File.exists? was deprecated for years and removed in Ruby 3.2;
    # File.exist? is available on every Ruby version.
    return if File.exist?(dirspec)

    mkdirs(File.dirname(dirspec))
    begin
      Dir.mkdir(dirspec)
    rescue Errno::EEXIST
      # The directory appeared between the check and the mkdir (e.g. another
      # process made it). That is fine as long as it really is a directory.
      raise unless File.directory?(dirspec)
    end
  end
end
data/lib/imdb.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'open-uri'
2
+ require 'date'
3
+ require 'cgi'
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ require 'chronic'
7
+
8
+ require File.dirname(__FILE__) + '/imdb/imdb_search'
9
+ require File.dirname(__FILE__) + '/imdb/imdb_movie'
10
+ require File.dirname(__FILE__) + '/imdb/imdb_image'
11
+ require File.dirname(__FILE__) + '/string_extensions'
12
+ require File.dirname(__FILE__) + '/file_extensions'
13
+ require File.dirname(__FILE__) + '/object_extensions'
@@ -0,0 +1,19 @@
1
+ # @imdb_movie.poster.should == 'http://ia.media-imdb.com/images/M/MV5BMTkzODA5ODYwOV5BMl5BanBnXkFtZTcwMjAyNDYyMQ@@._V1._SX216_SY316_.jpg'
2
+
3
# An image page on IMDB (for example the page a movie's poster link points
# to) from which the address of the displayed picture can be extracted.
class ImdbImage

  # Absolute address of the image page on www.imdb.com
  attr_accessor :url

  # @param [String] url path of the image page relative to the site root
  def initialize(url)
    @url = File.join("http://www.imdb.com/", url)
  end

  # Address of the picture shown on the page.
  #
  # @return [String, nil] the img element's src attribute, or nil when the
  #   page cannot be fetched or does not contain the expected markup
  def image
    document.at("table#principal tr td img")['src'] rescue nil
  end

  # The parsed page; downloaded on first use and reused afterwards.
  def document
    @document ||= Hpricot(fetch_html)
  end

  private

  # Download the page body and close the handle afterwards.
  # open-uri stopped patching Kernel#open in Ruby 3.0, so URI.open is used
  # when it is available and Kernel#open only on older Rubies.
  def fetch_html
    if URI.respond_to?(:open)
      URI.open(url) { |io| io.read }
    else
      open(url) { |io| io.read }
    end
  end

end
@@ -0,0 +1,224 @@
1
# A single title on IMDB, identified by its numeric id.
#
# The readers below scrape the title's page with Hpricot selectors. The page
# is downloaded lazily on first use and then reused. Most readers are best
# effort: they return nil (or an empty Array) when the information is not
# found or the page cannot be retrieved.
class ImdbMovie
  include Comparable

  # Total number of download attempts made before the error is re-raised
  MAX_ATTEMPTS = 3
  # Pause, in seconds, between two download attempts
  SECONDS_BETWEEN_RETRIES = 1.0

  attr_reader :id, :url

  # @param [String] id    the digits of the IMDB id (without the 'tt' prefix)
  # @param [String] title optional already-known title; when given, #title
  #   returns it without consulting the page
  def initialize(id, title = nil)
    @id = id
    @url = sprintf(ImdbMovie::url_format, @id.to_s)
    @title = title
  end

  # this is intended to be stubbed by rspec where it
  # should return the path to the cached html file
  # Note, the returned String should have one '%s'
  # which will be replaced by sprintf with @id.to_s
  def self.url_format
    'http://www.imdb.com/title/tt%s/'
  end

  # this is intended to be stubbed by rspec where it
  # should return true.
  def self.use_html_cache
    false
  end

  # Order movies by id (this also drives == through Comparable).
  # Returns nil, meaning "not comparable", for anything that is not an
  # ImdbMovie instead of raising.
  def <=>(other)
    return nil unless other.is_a?(ImdbMovie)
    @id <=> other.id
  end

  # Array#uniq and Hash keys rely on eql?/hash rather than <=>, so both are
  # defined on the id to make uniq() drop movies that share an id.
  def eql?(other)
    other.is_a?(ImdbMovie) && @id.eql?(other.id)
  end

  def hash
    @id.hash
  end

  # The title given to the constructor, otherwise the one scraped from the
  # page (nil when it cannot be scraped).
  def title
    if @title.nil?
      @title = document.at("div#tn15title h1").innerHTML.split('<span>').first.strip.unescape_html rescue nil
    end
    @title
  end

  # @return [Array<String>] director names, [] on failure
  def directors
    document.search("h5[text()^='Director'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
  end

  # @return [String, nil] href of the poster link
  def poster_url
    document.at("a[@name='poster']")['href'] rescue nil
  end

  # @return [String, nil] src of the thumbnail inside the poster link
  def tiny_poster_url
    document.at("a[@name='poster'] img")['src'] rescue nil
  end

  # @return [ImdbImage, nil] the poster page, nil when there is no poster link
  def poster
    ImdbImage.new(poster_url) rescue nil
  end

  # @return [Float, nil] the part of the user rating before the '/'
  def rating
    document.at("h5[text()='User Rating:'] ~ b").innerHTML.strip.unescape_html.split('/').first.to_f rescue nil
  end

  # @return [Array<Array<String>>] one [actor, character] pair per row of
  #   the cast table
  def cast_members
    document.search("table.cast tr").map do |row|
      actor = row.search("td.nm a").innerHTML.strip.unescape_html
      character = row.search("td.char a").innerHTML.strip.unescape_html
      # characters without a link of their own are plain text in the cell
      if character.empty?
        character = row.search("td.char").innerHTML.strip.unescape_html
      end
      [actor, character]
    end
  end

  # @return [Array<String>] writer names, [] on failure
  def writers
    document.search("h5[text()^='Writer'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue []
  end

  # @return [String] text of the link(s) into the per-year section
  def year
    document.search('a[@href^="/Sections/Years/"]').innerHTML
  end

  # @return [Date, nil] the full release date, nil when the page does not
  #   show one in 'day month year' form (see #release_year for that case)
  def release_date
    date = document.search("//h5[text()^='Release Date']/..").innerHTML[/^\d{1,2} \w+ \d{4}/]
    Date.parse(Chronic.parse(date).strftime('%Y/%m/%d'))
  rescue
    nil
  end

  # @return [Array<String>] genre names, [] on failure
  def genres
    document.search("h5[text()='Genre:'] ~ a[@href*=/Sections/Genres/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
  end

  def plot
    document.search("//h5[text()^='Plot']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
  end

  def tagline
    document.search("//h5[text()^='Tagline']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
  end

  def aspect_ratio
    document.search("//h5[text()^='Aspect Ratio']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil
  end

  # @return [String, nil] the runtime as it appears on the page, e.g. '127 min'
  def length
    document.search("//h5[text()^='Runtime']/..").innerHTML[/\d+ min/] rescue nil
  end

  # @return [Array<String>] country names, [] on failure
  def countries
    document.search("h5[text()='Country:'] ~ a[@href*=/Sections/Countries/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
  end

  # @return [Array<String>] language names, [] on failure
  def languages
    document.search("h5[text()='Language:'] ~ a[@href*=/Sections/Languages/']").map { |link| link.innerHTML.strip.unescape_html } rescue []
  end

  def color
    document.at("h5[text()='Color:'] ~ a[@href*=color-info']").innerHTML.strip.unescape_html rescue nil
  end

  def company
    document.at("h5[text()='Company:'] ~ a[@href*=/company/']").innerHTML.strip.unescape_html rescue nil
  end

  # @return [Array<String>] src of every thumbnail in the media strip
  def photos
    document.search(".media_strip_thumb img").map { |img| img['src'] } rescue []
  end

  # return the raw title
  def raw_title
    document.at("h1").innerText
  end

  # is this a video game as indicated by a '(VG)' in the raw title?
  # The result is truthy/falsy (a match position or nil), not strictly boolean.
  def video_game?
    raw_title =~ /\(VG\)/
  end

  # find the release year
  # Note, this is needed because not all entries on IMDB have a full
  # release date as parsed by release_date.
  def release_year
    document.search("//h5[text()^='Release Date']/..").innerHTML[/\d{4}/]
  end

  # return an Array of Strings containing AKA titles
  def also_known_as
    el = document.search("//h5[text()^='Also Known As:']/..").at('h5')
    aka = []
    # walk the siblings that follow the heading, keeping only the text nodes
    while(!el.nil?)
      aka << el.to_s unless el.elem?
      el = el.next
    end
    # drop the parenthesised qualifiers that follow each title
    aka.collect!{|a| a.gsub(/\([^\)]*\)/, '').strip}
    aka.uniq!
    aka.compact!
    aka.select{|a| !a.empty?}
  end

  # The MPAA rating, i.e. "PG-13"
  def mpaa
    document.search("//h5[text()^='MPAA']/..").text.gsub('MPAA:', '').strip rescue nil
  end

  # older films may not have MPAA ratings but usually have a certification.
  # return a hash with country abbreviations for keys and the certification string for the value
  # example:  {'USA' => 'Approved'}
  def certifications
    cert_hash = {}
    certs = document.search("h5[text()='Certification:'] ~ a[@href*=/List?certificates']").map { |link| link.innerHTML.strip } rescue []
    certs.each { |line| cert_hash[$1] = $2 if line =~ /(.*):(.*)/ }
    cert_hash
  end

  private

  # Fetch and parse the page, retrying to handle the occasional glitches.
  #
  # The page is downloaded only once per instance; later calls return the
  # parsed document straight away. A failing download is attempted
  # MAX_ATTEMPTS times in total, pausing SECONDS_BETWEEN_RETRIES between
  # attempts, after which the last error is re-raised.
  def document
    return @document if @document

    attempts = 0
    begin
      html = fetch_html
      @document = Hpricot(html)
      cache_html_files(html) if ImdbMovie::use_html_cache
    rescue StandardError
      # StandardError (not Exception) so that interrupts and exit requests
      # are never swallowed and retried.
      attempts += 1
      raise if attempts >= MAX_ATTEMPTS
      sleep SECONDS_BETWEEN_RETRIES
      retry
    end
    @document
  end

  # Download the page body and close the handle afterwards.
  # open-uri stopped patching Kernel#open in Ruby 3.0, so URI.open is used
  # when it is available and Kernel#open only on older Rubies.
  def fetch_html
    if URI.respond_to?(:open)
      URI.open(url) { |io| io.read }
    else
      open(url) { |io| io.read }
    end
  end

  # this is used to save imdb pages so they may be used by rspec
  # Failures are reported on stdout and otherwise ignored.
  def cache_html_files(html)
    filespec = self.url.gsub(/^http:\//, 'spec/samples').gsub(/\/$/, '.html')
    return if File.exist?(filespec)

    puts filespec
    File.mkdirs(File.dirname(filespec))
    File.open(filespec, 'w') { |f| f.puts html }
  rescue StandardError => e
    puts e.to_s
  end

end
@@ -0,0 +1,111 @@
1
# A title search on IMDB.
#
# The result page is downloaded lazily and parsed into ImdbMovie objects.
# IMDB answers a query with a single hit by showing that title's own page,
# which is detected and handled separately from a list of results.
class ImdbSearch

  attr_reader :query

  # @param [String]  query       the title to search for
  # @param [Boolean] search_akas when true and several results come back,
  #   keep only those whose title or AKA titles match the query (this
  #   loads the page of every result)
  def initialize(query, search_akas=false)
    @query = query
    @search_akas = search_akas
  end

  # @return [Array<ImdbMovie>] the movies found; computed once, then reused
  def movies
    @movies ||= parse_movies_from_document
  end

  # Find the IMDB ID for the current search title
  # The find can be helped a lot by including a years option that contains
  # an Array of integers that are the production year (plus/minus a year)
  # and the release year.
  #
  # @param [Hash] options :years => Array of Integer release years to accept
  # @return [String, nil] the id with its 'tt' prefix when exactly one
  #   candidate remains, nil otherwise
  def find_id(options={})
    found_movies = self.movies
    return nil if found_movies.nil?

    years = options[:years]
    desired_movies = found_movies.select do |m|
      aka = m.also_known_as
      result = imdb_compare_titles(m.title, aka, @query) && !m.video_game? && !m.release_year.nil?
      result = years.include?(m.release_year.to_i) if result && !years.nil?
      result
    end

    ids = desired_movies.collect{|m| m.id}.uniq.compact
    ids.length == 1 ? "tt#{ids[0]}" : nil
  end

  protected

  # compare the imdb title and the imdb title's AKAs against the media title.
  # note, on exact match lookups, IMDB will sometimes set the title to
  # 'trailers and videos' instead of the correct title.
  def imdb_compare_titles(imdb_title, aka_titles, media_title)
    return true if fuzzy_compare_titles(imdb_title, media_title)
    return true if fuzzy_compare_titles(imdb_title, 'trailers and videos')
    Array(aka_titles).any? { |aka| fuzzy_compare_titles(aka, media_title) }
  end

  # a fuzzy compare that is case insensitive and replaces '&' with 'and'
  # (because that is what IMDB occasionally does)
  # Each normalisation is tried on its own; they are not combined.
  # A missing (nil) title never matches: a movie whose title could not be
  # scraped must not abort the whole search.
  def fuzzy_compare_titles(title1, title2)
    return false if title1.nil? || title2.nil?

    t1 = title1.downcase
    t2 = title2.downcase
    (t1 == t2) ||
      (t1.gsub(/&/, 'and') == t2.gsub(/&/, 'and')) ||
      (t1.gsub(/[-:]/, ' ') == t2.gsub(/[-:]/, ' ')) ||
      (t1.gsub('more at imdbpro ?', '') == t2)
  end

  private

  # The parsed result page; downloaded on first use and reused afterwards.
  def document
    @document ||= Hpricot(fetch_html("http://www.imdb.com/find?q=#{CGI::escape(@query)};s=tt"))
  end

  # Download a page body and close the handle afterwards.
  # open-uri stopped patching Kernel#open in Ruby 3.0, so URI.open is used
  # when it is available and Kernel#open only on older Rubies.
  def fetch_html(address)
    if URI.respond_to?(:open)
      URI.open(address) { |io| io.read }
    else
      open(address) { |io| io.read }
    end
  end

  def parse_movies_from_document
    exact_match? ? parse_exact_match_search_results : parse_multi_movie_search_results
  end

  # The response is the title page itself: take the id from the end of the
  # poster link and the title from the page heading.
  def parse_exact_match_search_results
    id = document.at("a[@name='poster']")['href'][/\d+$/]
    title = document.at("h1").innerHTML.split('<span').first.strip.unescape_html rescue nil
    [ImdbMovie.new(id, title)]
  end

  # The response is a list of results: every link to a title page that has
  # visible text becomes a movie.
  def parse_multi_movie_search_results
    ids_and_titles = document.search('a[@href^="/title/tt"]').reject do |element|
      element.innerHTML.strip_tags.empty?
    end.map do |element|
      [element['href'][/\d+/], element.innerHTML.strip_tags.unescape_html]
    end.uniq

    films = ids_and_titles.map do |id_and_title|
      ImdbMovie.new(id_and_title[0], id_and_title[1])
    end.uniq

    if films.length > 1 && @search_akas
      films = films.select do |m|
        aka = m.also_known_as
        imdb_compare_titles(m.title, aka, @query) && !m.video_game?
      end
    end
    films
  end

  # true when the response is a title page rather than a result list
  def exact_match?
    document.search("title[text()='IMDb Title Search']").empty? && !document.search("a[@name='poster']").empty?
  end

end
@@ -0,0 +1,19 @@
1
+ # == Synopsis
2
+ # add a blank? method to all Objects
3
class Object
  # Tell whether the receiver carries no content.
  #
  # nil is blank. Anything that answers to empty? is blank when it is empty;
  # if it also answers to strip (as Strings do) the stripped copy is examined,
  # so whitespace-only text counts as blank. Every other object is not blank.
  # Collections are not compacted first, so [nil] is not blank.
  def blank?
    return true if nil?
    return false unless respond_to? 'empty?'

    candidate = respond_to?('strip') ? strip : self
    candidate.empty?
  end
end
@@ -0,0 +1,16 @@
1
require 'cgi'
begin
  # Iconv left the standard library in Ruby 2.0. It is still used when
  # present, but its absence must not prevent this file from loading.
  require 'iconv'
rescue LoadError
end

# String helpers for text scraped from IMDB pages; mixed into String below.
module ImdbStringExtensions

  # Decode HTML entities and convert the text to UTF-8.
  #
  # The receiver's bytes are interpreted as ISO-8859-1 regardless of the
  # encoding the String is tagged with.
  #
  # @return [String] a new UTF-8 String; the receiver is not modified
  def unescape_html
    if defined?(::Iconv)
      Iconv.conv("UTF-8", 'ISO-8859-1', CGI::unescapeHTML(self))
    else
      # Same conversion with the built-in transcoder: tag a copy of the bytes
      # as ISO-8859-1 so that numeric entities decode to single Latin-1
      # bytes, then transcode the result.
      latin1 = dup.force_encoding(Encoding::ISO_8859_1)
      CGI::unescapeHTML(latin1).encode(Encoding::UTF_8)
    end
  end

  # Remove everything that looks like an HTML tag, keeping the text between.
  #
  # @return [String] a new String without the tags
  def strip_tags
    gsub(/<\/?[^>]*>/, "")
  end

end

String.send :include, ImdbStringExtensions
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: royw-imdb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.8
5
+ platform: ruby
6
+ authors:
7
+ - Sergio Gil
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-03-21 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.6"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: chronic
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description:
36
+ email: sgilperez@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/imdb/imdb_movie.rb
45
+ - lib/imdb/imdb_search.rb
46
+ - lib/imdb/imdb_image.rb
47
+ - lib/imdb.rb
48
+ - lib/file_extensions.rb
49
+ - lib/object_extensions.rb
50
+ - lib/string_extensions.rb
51
+ - README
52
+ has_rdoc: false
53
+ homepage:
54
+ post_install_message:
55
+ rdoc_options: []
56
+
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: "0"
70
+ version:
71
+ requirements: []
72
+
73
+ rubyforge_project:
74
+ rubygems_version: 1.2.0
75
+ signing_key:
76
+ specification_version: 2
77
+ summary: Internet Movie DataBase
78
+ test_files: []
79
+