jsahlen-imdb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +21 -0
- data/lib/imdb.rb +19 -0
- data/lib/imdb/movie.rb +83 -0
- data/lib/imdb/search.rb +50 -0
- metadata +76 -0
data/README.markdown
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
IMDb Parser
|
2
|
+
===========
|
3
|
+
|
4
|
+
Built this as a learning exercise, and because the existing libraries didn't work exactly as I wanted them to.
|
5
|
+
|
6
|
+
This is an extremely early version, which will likely be severely refactored as it progresses.
|
7
|
+
|
8
|
+
Search example
|
9
|
+
--------------
|
10
|
+
|
11
|
+
This is the only functionality that works right now.
|
12
|
+
|
13
|
+
require 'imdb'
|
14
|
+
|
15
|
+
results = IMDB::Search.new("howl")
|
16
|
+
results.each do |movie|
|
17
|
+
puts "#{movie.title} (#{movie.year}) [#{movie.id}]"
|
18
|
+
unless movie.aka.empty?
|
19
|
+
movie.aka.each { |aka| puts " a.k.a. #{aka}" }
|
20
|
+
end
|
21
|
+
end
|
data/lib/imdb.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module IMDB
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hpricot'
|
5
|
+
require 'cgi'
|
6
|
+
require 'iconv'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
FILES=%w{search movie}
|
10
|
+
FILES.each { |f| require File.join(File.dirname(__FILE__), 'imdb', f) }
|
11
|
+
|
12
|
+
TITLES_SEARCH_URL="http://www.imdb.com/find?s=tt&q="
|
13
|
+
TITLE_URL="http://www.imdb.com/title/"
|
14
|
+
|
15
|
+
# Re-encodes +str+, whose bytes are interpreted as Latin-1 (ISO-8859-1),
# into UTF-8 and returns the converted copy.
#
# Iconv was removed from Ruby's standard library in 2.0, so String#encode
# is used whenever the string supports it; Iconv remains the fallback for
# Ruby 1.8, where String#encode does not exist.
def self.str_to_utf8(str)
  if str.respond_to?(:encode)
    # Two-argument form: transcode to UTF-8 *from* ISO-8859-1, regardless of
    # the encoding the string happens to be tagged with.
    str.encode('UTF-8', 'ISO-8859-1')
  else
    Iconv.conv('UTF-8', 'LATIN1', str)
  end
end
|
18
|
+
|
19
|
+
end
|
data/lib/imdb/movie.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
module IMDB

  # A single IMDb title. Instances are filled in either attribute by
  # attribute (as Search does for result rows) or by scraping a parsed
  # title page via #parse_doc.
  class Movie
    attr_accessor :id, :title, :year, :aka, :url, :director, :extra

    # Builds a movie from a hash of attribute name => value pairs.
    # Each pair is stored in the instance variable of the same name.
    # +extra+ defaults to an empty hash.
    def initialize(attributes={})
      self.extra = {}

      attributes.each_pair do |key, value|
        instance_variable_set "@#{key}", value
      end

      # instance_variable_set bypasses #id=, so derive the URL here; an
      # explicitly supplied :url is left untouched.
      self.url ||= IMDB::TITLE_URL + id.to_s unless id.nil?
    end

    # Builds a movie from an already parsed title page.
    def self.new_from_doc(doc)
      movie = self.new
      movie.parse_doc(doc)
      movie
    end

    # Builds a movie from its IMDb id by downloading and parsing its page.
    def self.new_from_id(id)
      movie = self.new
      movie.id = id.to_s
      movie.get_full_details
      movie
    end

    # Downloads the title page for the current id and parses it.
    # Returns self.
    def get_full_details
      parse_doc(fetch_doc(IMDB::TITLE_URL + CGI.escape(id)))
      self
    end

    # Extracts id, title, year and alternative titles from a parsed title
    # page, then delegates to #parse_full_details. Returns self.
    def parse_doc(doc)
      akalink      = (doc/"a[@href$='releaseinfo#akas']")[0]
      credits_link = (doc/"a[@href$='fullcredits']")[0]

      # Keep an id that is already known (e.g. from new_from_id) when the
      # page has no usable full-credits link.
      parsed_id = credits_link && credits_link.attributes["href"][/title\/(.*?)\//, 1]
      self.id = parsed_id if parsed_id

      self.title = decode_text((doc/"h1").inner_html[/^(.*?)(?: <span)/, 1])
      self.year  = (doc/"h1 a[@href^='/Sections/Years/']").inner_html
      self.aka   = if akalink
        akalink.parent.inner_html.scan(/(?:>)([^<]*?)\(/).collect { |x| decode_text(x[0].strip) }
      else
        []
      end

      parse_full_details(doc)

      self
    end

    # Extracts directors, writers, tagline, plot, MPAA rating and cast from
    # a parsed title page. Director is only assigned when director links
    # are found; tagline, plot, rating and cast are only stored in +extra+
    # when the corresponding element exists. Returns self.
    def parse_full_details(doc)
      director_links = (doc/"#director-info/a")
      writer_links   = (doc/"a[@onclick*='writerlist']")
      tagline_header = (doc/"h5[text()='Tagline:']")[0]
      plot_link      = (doc/"a[@href$='/plotsummary']")[0]
      mpaa_link      = (doc/"a[@href='/mpaa']")[0]
      cast_table     = (doc/"table.cast")[0]

      if director_links.length > 0
        self.director = director_links.collect { |l| decode_text(l.inner_html) }
      end

      self.extra["writers"]     = writer_links.collect { |w| decode_text(w.inner_html) } if writer_links
      self.extra["tagline"]     = decode_text(tagline_header.parent.inner_html[/\/h5>(.+?)(<|$)/m, 1].strip) if tagline_header
      self.extra["plot"]        = decode_text(plot_link.parent.inner_html[/\/h5>(.+?)<a/m, 1].strip) if plot_link
      self.extra["mpaa_rating"] = decode_text(mpaa_link.parent.parent.inner_html[/\/h5>(.+)$/m, 1].strip) if mpaa_link
      self.extra["cast"]        = parse_cast(cast_table) if cast_table

      self
    end

    # Sets the id and keeps +url+ in sync with it. A nil id clears the URL
    # instead of raising.
    def id=(id)
      @id = id
      self.url = id.nil? ? nil : IMDB::TITLE_URL + id.to_s
    end

    # Serializes id, title, year, aka, director and extra as a JSON object.
    def to_json(*a)
      {
        "id"       => self.id,
        "title"    => self.title,
        "year"     => self.year,
        "aka"      => self.aka,
        "director" => self.director,
        "extra"    => self.extra
      }.to_json(*a)
    end

    private

    # Unescapes HTML entities and converts the text to UTF-8.
    def decode_text(html)
      IMDB.str_to_utf8(CGI.unescapeHTML(html))
    end

    # Turns the rows of the cast table into actor/character hashes. Rows
    # lacking either a td.nm or a td.char cell are skipped.
    def parse_cast(cast_table)
      (cast_table/"tr").collect { |tr|
        actor     = (tr/"td.nm")[0]
        character = (tr/"td.char")[0]
        next unless actor && character

        { "actor" => decode_text(actor.inner_text), "character" => decode_text(character.inner_text) }
      }.compact
    end

    # Downloads +url+ and parses it with Hpricot. Kernel#open stopped
    # accepting URLs in Ruby 3.0, so URI.open is used when open-uri provides
    # it; the block form closes the handle once parsing is done.
    def fetch_doc(url)
      if URI.respond_to?(:open)
        URI.open(url) { |io| Hpricot(io) }
      else
        open(url) { |io| Hpricot(io) }
      end
    end
  end

end
|
data/lib/imdb/search.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
module IMDB

  # Runs an IMDb title search and exposes the matches as Movie objects.
  # The object is Enumerable over its +results+.
  class Search
    include Enumerable

    attr_accessor :results

    # Searches IMDb for +title+.
    #
    # opts[:limit] caps the number of results kept; 0 (the default) or a
    # negative value means no cap. A nil or whitespace-only title performs
    # no request and leaves +results+ empty.
    def initialize(title, opts={})
      @limit = opts[:limit] || 0
      self.results = []

      query = title.to_s
      # strip-based check: a line-anchored /^\s*$/ would also treat a title
      # such as "\nhowl" as blank.
      return if query.strip.empty?

      doc = fetch_doc(IMDB::TITLES_SEARCH_URL + CGI.escape(query))

      if (doc/"div#tn15.maindetails").empty?
        # Search result page: one link per matching title. Links whose
        # content starts with markup are dropped.
        links = (doc/"td > a[@href^='/title/']").reject { |a| a.inner_html =~ /^</ }
        self.results = links.map { |a| movie_from_link(a) }
      else
        # Single match: the response is the title page itself.
        self.results = [Movie.new_from_doc(doc)]
      end

      self.results = results.slice(0, @limit) if @limit > 0
    end

    # Yields each result in turn; returns an enumerator when called
    # without a block.
    def each
      return to_enum(:each) unless block_given?

      results.each { |x| yield x }
    end

    # Serializes the result list as JSON.
    def to_json(*a)
      results.to_json(*a)
    end

    private

    # Builds a Movie from one result link and the table cell around it.
    def movie_from_link(a)
      cell_html = a.parent.inner_html
      movie = Movie.new

      movie.id    = a.attributes["href"][/^\/title\/([^\/]+)/, 1]
      movie.title = IMDB.str_to_utf8(CGI.unescapeHTML(a.inner_html))
      movie.year  = cell_html[/<\/a>\s\((\d+)\)/, 1]
      movie.aka   = cell_html.scan(/aka\s+<em>"([^"]+)"<\/em>/).collect { |x| IMDB.str_to_utf8(CGI.unescapeHTML(x[0].strip)) }

      movie
    end

    # Downloads +url+ and parses it with Hpricot. Kernel#open stopped
    # accepting URLs in Ruby 3.0, so URI.open is used when open-uri provides
    # it; the block form closes the handle once parsing is done.
    def fetch_doc(url)
      if URI.respond_to?(:open)
        URI.open(url) { |io| Hpricot(io) }
      else
        open(url) { |io| Hpricot(io) }
      end
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jsahlen-imdb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- "Johan Sahl\xC3\xA9n"
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-14 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: json
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
description: A simple IMDb scraper
|
36
|
+
email: johan.sahlen@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README.markdown
|
43
|
+
files:
|
44
|
+
- README.markdown
|
45
|
+
- lib/imdb.rb
|
46
|
+
- lib/imdb/movie.rb
|
47
|
+
- lib/imdb/search.rb
|
48
|
+
has_rdoc: false
|
49
|
+
homepage: http://github.com/jsahlen/imdb
|
50
|
+
licenses:
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options:
|
53
|
+
- --charset=UTF-8
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
version:
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 1.3.5
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: A simple IMDb scraper
|
75
|
+
test_files: []
|
76
|
+
|