RubyGems - jsahlen-imdb - Versions diffs - 0.1.0 - Mend

jsahlen-imdb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

data/README.markdown ADDED

@@ -0,0 +1,21 @@
+IMDb Parser
+===========
+Built this as a learning exercise, and because the existing libraries didn't work exactly as I wanted them to.
+This is an extremely early version, which will likely be severely refactored as it progresses.
+Search example
+--------------
+This is the only functionality that works right now.
+    require 'imdb'
+    results = IMDB::Search.new("howl")
+    results.each do |movie|
+      puts "#{movie.title} (#{movie.year}) [#{movie.id}]"
+      unless movie.aka.empty?
+        movie.aka.each { |aka| puts "  a.k.a. #{aka}" }
+      end
+    end

data/lib/imdb.rb ADDED

@@ -0,0 +1,19 @@
+module IMDB
+  require 'open-uri'
+  require 'rubygems'
+  require 'hpricot'
+  require 'cgi'
+  require 'iconv'
+  require 'json'
+  FILES=%w{search movie}
+  FILES.each { |f| require File.join(File.dirname(__FILE__), 'imdb', f) }
+  TITLES_SEARCH_URL="http://www.imdb.com/find?s=tt&q="
+  TITLE_URL="http://www.imdb.com/title/"
+  def self.str_to_utf8(str)
+    Iconv.conv('UTF-8', 'LATIN1', str)
+  end
+end

data/lib/imdb/movie.rb ADDED

@@ -0,0 +1,83 @@
+module IMDB
+  class Movie
+    attr_accessor :id, :title, :year, :aka, :url, :director, :extra
+    def initialize(attributes={})
+      self.extra = {}
+      attributes.each_pair do |key, value|
+        self.instance_variable_set "@#{key}", value
+      end
+    end
+    def self.new_from_doc(doc)
+      movie = self.new
+      movie.parse_doc(doc)
+      movie
+    end
+    def self.new_from_id(id)
+      movie = self.new
+      movie.id = id.to_s
+      movie.get_full_details
+      movie
+    end
+    def get_full_details
+      doc = Hpricot(open(IMDB::TITLE_URL+CGI.escape(self.id)))
+      self.parse_doc(doc)
+      self
+    end
+    def parse_doc(doc)
+      akalink = (doc/"a[@href$='releaseinfo#akas']")[0]
+      self.id    = (doc/"a[@href$='fullcredits]")[0].attributes["href"][/title\/(.*?)\//, 1]
+      self.title = IMDB.str_to_utf8(CGI.unescapeHTML((doc/"h1").inner_html[/^(.*?)(?: <span)/, 1]))
+      self.year  = (doc/"h1 a[@href^='/Sections/Years/']").inner_html
+      self.aka   = akalink ? akalink.parent.inner_html.scan(/(?:>)([^<]*?)\(/).collect { |x| IMDB.str_to_utf8(CGI.unescapeHTML(x[0].strip)) } : []
+      parse_full_details(doc)
+      self
+    end
+    def parse_full_details(doc)
+      director_links = (doc/"#director-info/a")
+      writer_links   = (doc/"a[@onclick*='writerlist']")
+      tagline_header = (doc/"h5[text()='Tagline:']")[0]
+      plot_link      = (doc/"a[@href$='/plotsummary]")[0]
+      mpaa_link      = (doc/"a[@href='/mpaa']")[0]
+      if director_links.length > 0
+        self.director = director_links.collect { |l| IMDB.str_to_utf8(CGI.unescapeHTML(l.inner_html)) }
+      end
+      self.extra["writers"]     = writer_links.collect { |w| IMDB.str_to_utf8(CGI.unescapeHTML(w.inner_html)) } if writer_links
+      self.extra["tagline"]     = IMDB.str_to_utf8(CGI.unescapeHTML(tagline_header.parent.inner_html[/\/h5>(.+?)(<|$)/m, 1].strip)) if tagline_header
+      self.extra["plot"]        = IMDB.str_to_utf8(CGI.unescapeHTML(plot_link.parent.inner_html[/\/h5>(.+?)<a/m, 1].strip)) if plot_link
+      self.extra["mpaa_rating"] = IMDB.str_to_utf8(CGI.unescapeHTML(mpaa_link.parent.parent.inner_html[/\/h5>(.+)$/m, 1].strip)) if mpaa_link
+      self.extra["cast"]        = ((doc/"table.cast")[0]/"tr").collect { |tr| { "actor" => IMDB.str_to_utf8(CGI.unescapeHTML((tr/"td.nm")[0].inner_text)), "character" => IMDB.str_to_utf8(CGI.unescapeHTML((tr/"td.char")[0].inner_text)) } }
+      self
+    end
+    def id=(id)
+      @id = id
+      self.url = IMDB::TITLE_URL + id
+    end
+    def to_json(*a)
+      {
+        "id"       => self.id,
+        "title"    => self.title,
+        "year"     => self.year,
+        "aka"      => self.aka,
+        "director" => self.director,
+        "extra"    => self.extra
+      }.to_json(*a)
+    end
+  end
+end

data/lib/imdb/search.rb ADDED

@@ -0,0 +1,50 @@
+module IMDB
+  class Search
+    include Enumerable
+    attr_accessor :results
+    def initialize(title, opts={})
+      @limit = opts[:limit] || 0
+      if title =~ /^\s*$/
+        self.results = []
+        return self.results
+      end
+      doc = Hpricot(open(IMDB::TITLES_SEARCH_URL+CGI.escape(title)))
+      # Single match
+      unless (doc/"div#tn15.maindetails").empty?
+        self.results = [Movie.new_from_doc(doc)]
+      # Search result
+      else
+        links = (doc/"td > a[@href^='/title/']").delete_if { |a| a.inner_html =~ /^</ }
+        self.results = links.collect! do |a|
+          td = a.parent
+          movie = Movie.new
+          movie.id    = a.attributes["href"][/^\/title\/([^\/]+)/, 1]
+          movie.title = IMDB.str_to_utf8(CGI.unescapeHTML(a.inner_html))
+          movie.year  = td.inner_html[/<\/a>\s\((\d+)\)/, 1]
+          movie.aka   = td.inner_html.scan(/aka\s+<em>"([^"]+)"<\/em>/).collect { |x| IMDB.str_to_utf8(CGI.unescapeHTML(x[0].strip)) }
+          movie
+        end
+      end
+      self.results = self.results.slice(0, @limit) if @limit != 0
+    end
+    def each
+      self.results.each { |x| yield x }
+    end
+    def to_json(*a)
+      self.results.to_json(*a)
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,76 @@
+--- !ruby/object:Gem::Specification
+name: jsahlen-imdb
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- "Johan Sahl\xC3\xA9n"
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-08-14 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: json
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description: A simple IMDb scraper
+email: johan.sahlen@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.markdown
+files:
+- README.markdown
+- lib/imdb.rb
+- lib/imdb/movie.rb
+- lib/imdb/search.rb
+has_rdoc: false
+homepage: http://github.com/jsahlen/imdb
+licenses:
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: A simple IMDb scraper
+test_files: []