jsahlen-imdb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ IMDb Parser
2
+ ===========
3
+
4
+ Built this as a learning exercise, and because the existing libraries didn't work exactly as I wanted them to.
5
+
6
+ This is an extremely early version, which will likely be severely refactored as it progresses.
7
+
8
+ Search example
9
+ --------------
10
+
11
+ This is the only functionality that works right now.
12
+
13
+ require 'imdb'
14
+
15
+ results = IMDB::Search.new("howl")
16
+ results.each do |movie|
17
+ puts "#{movie.title} (#{movie.year}) [#{movie.id}]"
18
+ unless movie.aka.empty?
19
+ movie.aka.each { |aka| puts " a.k.a. #{aka}" }
20
+ end
21
+ end
@@ -0,0 +1,19 @@
# Top-level namespace of the IMDb scraper.
#
# Loads the libraries the scraper relies on, requires the files listed in
# FILES from the sibling "imdb" directory, and defines the base URLs plus a
# Latin-1 -> UTF-8 conversion helper.
module IMDB
  require 'open-uri'
  require 'rubygems'
  require 'hpricot'
  require 'cgi'
  # Iconv was removed from Ruby's standard library in 2.0. Treat it as
  # optional so that loading this file does not abort with a LoadError;
  # str_to_utf8 falls back to String#encode when it is missing.
  begin
    require 'iconv'
  rescue LoadError
    nil
  end
  require 'json'

  # Base names of the files under lib/imdb/ that make up the library.
  FILES=%w{search movie}
  FILES.each { |f| require File.join(File.dirname(__FILE__), 'imdb', f) }

  # The URL-escaped search term is appended to this URL.
  TITLES_SEARCH_URL="http://www.imdb.com/find?s=tt&q="
  # The title id is appended to this URL.
  TITLE_URL="http://www.imdb.com/title/"

  # Converts +str+, interpreted as Latin-1 (ISO-8859-1), to UTF-8.
  #
  # Uses Iconv when it is loaded and String#encode otherwise.
  # Returns the converted string.
  def self.str_to_utf8(str)
    if defined?(::Iconv)
      Iconv.conv('UTF-8', 'LATIN1', str)
    else
      # Second argument is the source encoding the bytes are read as.
      str.encode('UTF-8', 'ISO-8859-1')
    end
  end

end
@@ -0,0 +1,83 @@
module IMDB

  # A single IMDb title.
  #
  # Instances are filled in either from a hash of attributes, from an
  # already parsed Hpricot document (new_from_doc), or by downloading the
  # title page for a known id (new_from_id).
  class Movie
    # :id has a custom writer (see #id=), so only its reader is generated.
    attr_reader :id
    attr_accessor :title, :year, :aka, :url, :director, :extra

    # Builds a movie from a hash of attribute name => value pairs.
    #
    # Every pair is stored in the instance variable of the same name.
    # +extra+ defaults to an empty hash.
    def initialize(attributes={})
      self.extra = {}

      attributes.each_pair do |key, value|
        self.instance_variable_set "@#{key}", value
      end

      # instance_variable_set bypasses #id=, so the URL would stay nil for
      # Movie.new(:id => ...). Derive it here unless one was given explicitly.
      self.id = @id if !@id.nil? && @url.nil?
    end

    # Builds a movie from a parsed title page.
    def self.new_from_doc(doc)
      movie = self.new
      movie.parse_doc(doc)
      movie
    end

    # Builds a movie by downloading and parsing the title page for +id+.
    def self.new_from_id(id)
      movie = self.new
      movie.id = id.to_s
      movie.get_full_details
      movie
    end

    # Downloads the title page for the current id and parses it.
    # Returns self.
    def get_full_details
      # Block form closes the handle returned by open-uri once parsed.
      doc = open(IMDB::TITLE_URL+CGI.escape(self.id)) { |page| Hpricot(page) }
      self.parse_doc(doc)
      self
    end

    # Fills id, title, year and aka from a parsed title page, then delegates
    # to parse_full_details for the remaining fields. Returns self.
    def parse_doc(doc)
      akalink = (doc/"a[@href$='releaseinfo#akas']")[0]
      credits_link = (doc/"a[@href$='fullcredits']")[0]

      # Keep an already known id when the page offers nothing to read it from.
      parsed_id = credits_link && credits_link.attributes["href"][/title\/(.*?)\//, 1]
      self.id = parsed_id if parsed_id

      self.title = decode((doc/"h1").inner_html[/^(.*?)(?: <span)/, 1])
      self.year = (doc/"h1 a[@href^='/Sections/Years/']").inner_html
      self.aka = akalink ? akalink.parent.inner_html.scan(/(?:>)([^<]*?)\(/).collect { |x| decode(x[0], true) } : []

      parse_full_details(doc)

      self
    end

    # Fills director and the "writers", "tagline", "plot", "mpaa_rating" and
    # "cast" entries of +extra+ from a parsed title page. Returns self.
    #
    # Fields whose markup is not found are left unset (or nil when the
    # surrounding element exists but its text cannot be extracted); "cast"
    # is an empty array when the page has no cast table.
    def parse_full_details(doc)
      director_links = (doc/"#director-info/a")
      writer_links = (doc/"a[@onclick*='writerlist']")
      tagline_header = (doc/"h5[text()='Tagline:']")[0]
      plot_link = (doc/"a[@href$='/plotsummary']")[0]
      mpaa_link = (doc/"a[@href='/mpaa']")[0]
      cast_table = (doc/"table.cast")[0]

      if director_links.length > 0
        self.director = director_links.collect { |l| decode(l.inner_html) }
      end

      self.extra["writers"] = writer_links.collect { |w| decode(w.inner_html) } if writer_links
      self.extra["tagline"] = decode(tagline_header.parent.inner_html[/\/h5>(.+?)(<|$)/m, 1], true) if tagline_header
      self.extra["plot"] = decode(plot_link.parent.inner_html[/\/h5>(.+?)<a/m, 1], true) if plot_link
      self.extra["mpaa_rating"] = decode(mpaa_link.parent.parent.inner_html[/\/h5>(.+)$/m, 1], true) if mpaa_link
      self.extra["cast"] = cast_table ? parse_cast(cast_table) : []

      self
    end

    # Sets the id and keeps +url+ in sync with it.
    #
    # A nil id clears the URL; any other value is appended to TITLE_URL via
    # to_s, so non-String ids no longer raise a TypeError.
    def id=(id)
      @id = id
      self.url = id.nil? ? nil : IMDB::TITLE_URL + id.to_s
    end

    # Serializes id, title, year, aka, director and extra as a JSON object.
    # Arguments are passed through to Hash#to_json.
    def to_json(*a)
      {
        "id" => self.id,
        "title" => self.title,
        "year" => self.year,
        "aka" => self.aka,
        "director" => self.director,
        "extra" => self.extra
      }.to_json(*a)
    end

    private

    # Unescapes HTML entities in +html+ and converts the result to UTF-8,
    # optionally stripping surrounding whitespace first.
    # Returns nil when +html+ is nil (e.g. a regex that did not match).
    def decode(html, strip = false)
      return nil if html.nil?
      html = html.strip if strip
      IMDB.str_to_utf8(CGI.unescapeHTML(html))
    end

    # Turns the rows of the cast table into "actor"/"character" hashes.
    # Rows without an actor cell are skipped; a missing character cell
    # yields a nil "character".
    def parse_cast(table)
      (table/"tr").collect do |tr|
        actor = (tr/"td.nm")[0]
        character = (tr/"td.char")[0]
        next unless actor

        {
          "actor" => decode(actor.inner_text),
          "character" => decode(character && character.inner_text)
        }
      end.compact
    end
  end

end
@@ -0,0 +1,50 @@
module IMDB

  # A title search against IMDb.
  #
  # The search runs in the constructor; the matches are exposed through
  # +results+ and through Enumerable.
  class Search
    include Enumerable

    attr_accessor :results

    # Searches IMDb for +title+.
    #
    # title - the search term; nil or a whitespace-only string yields no
    #         results and performs no request.
    # opts  - :limit => maximum number of results to keep; values that are
    #         missing, zero or negative mean "no limit".
    def initialize(title, opts={})
      @limit = opts[:limit].to_i
      self.results = []

      # \A and \z anchor the whole string; ^ and $ would also match a single
      # empty line inside an otherwise non-blank title.
      return if title.to_s =~ /\A\s*\z/

      # Block form closes the handle returned by open-uri once parsed.
      doc = open(IMDB::TITLES_SEARCH_URL+CGI.escape(title)) { |page| Hpricot(page) }

      if (doc/"div#tn15.maindetails").empty?
        # Search result: one movie per title link, skipping links whose
        # content starts with a tag.
        links = (doc/"td > a[@href^='/title/']").delete_if { |a| a.inner_html =~ /^</ }
        found = links.collect do |a|
          td = a.parent
          movie = Movie.new

          movie.id = a.attributes["href"][/^\/title\/([^\/]+)/, 1]
          movie.title = IMDB.str_to_utf8(CGI.unescapeHTML(a.inner_html))
          movie.year = td.inner_html[/<\/a>\s\((\d+)\)/, 1]
          movie.aka = td.inner_html.scan(/aka\s+<em>"([^"]+)"<\/em>/).collect { |x| IMDB.str_to_utf8(CGI.unescapeHTML(x[0].strip)) }

          movie
        end
        # Keep results a plain Array, like the other two code paths.
        self.results = found.to_a
      else
        # Single match: the response is the title page itself.
        self.results = [Movie.new_from_doc(doc)]
      end

      # A negative length would make slice return nil, so only trim for
      # positive limits.
      self.results = self.results.slice(0, @limit) if @limit > 0
    end

    # Yields each result in order. Passing the block through lets a call
    # without a block behave like Array#each without one.
    def each(&block)
      self.results.each(&block)
    end

    # Serializes the results as a JSON array.
    def to_json(*a)
      self.results.to_json(*a)
    end
  end

end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jsahlen-imdb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - "Johan Sahl\xC3\xA9n"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-14 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: json
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description: A simple IMDb scraper
36
+ email: johan.sahlen@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - README.markdown
43
+ files:
44
+ - README.markdown
45
+ - lib/imdb.rb
46
+ - lib/imdb/movie.rb
47
+ - lib/imdb/search.rb
48
+ has_rdoc: false
49
+ homepage: http://github.com/jsahlen/imdb
50
+ licenses:
51
+ post_install_message:
52
+ rdoc_options:
53
+ - --charset=UTF-8
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ requirements: []
69
+
70
+ rubyforge_project:
71
+ rubygems_version: 1.3.5
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: A simple IMDb scraper
75
+ test_files: []
76
+