RubyGems - movie_crawler - Versions diffs - 0.1.1 → 0.2.2 - Mend

movie_crawler 0.1.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/.DS_Store +0 -0
data/README.md +7 -6
data/bin/app +6 -8
data/lib/.DS_Store +0 -0
data/lib/movie_crawler.rb +2 -1
data/lib/movie_crawler/.DS_Store +0 -0
data/lib/movie_crawler/crawler.rb +184 -131
data/lib/movie_crawler/version.rb +1 -1
data/movie_crawler.gemspec +2 -1
data/spec/movies_spec.rb +4 -4
data/spec/rank_spec.rb +1 -1
metadata +19 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3f6ebe9cba57dd662d4f04a78fc0891ad3c1f77f
-  data.tar.gz: 4e63f3330a19ebae78af35c75dc64c201e2ad72d
+  metadata.gz: 0e40451420c351951666d34b022d5a1fb79c31e6
+  data.tar.gz: 2caf81767ea7b73af694629c149c07e763921c31
 SHA512:
-  metadata.gz: 5843495e2bb5b0a9b5bad3ad57e995484d62e843f46ff0787c773ba9cb3320a028277032ad438c2bc59396a5484492691e06d43769a1981982cb3ec70d897285
-  data.tar.gz: 5e2eb53601932cc76f510770dd4c834f24f0afe1fa76543f1372a3dce82e3dc765e6e2f13a13826f4456b3b4474c043d2c4b8ec4b5849af7db32495c9adefd46
+  metadata.gz: 292e4344a628490095120a074e1b5c28fa624b4f7a986f2fbb51d301f5989ac3adb8f4727aa2d007115c7fb135cf8726e119e79d54487f7b0595ed7f3902ad8a
+  data.tar.gz: a41689f0cc79363c52ad158c4ab45327d0995474fab5ef5ba94d581532f99850e8bb63ab39c8c6a8c98e81695fe941f6d9d7cf7286ebea64bcf085790f4ef379

data/.DS_Store ADDED

Binary file

data/README.md CHANGED

@@ -8,7 +8,9 @@ MovieInfo tries to grabs some information on the [**@movies**](www.atmovies.com.
 ## About
-If you want check the movie schedule in Taiwan.And choose which to see at the weekend. The gem will provide you with the current films including first, second and recommend movie list. Also allowing you check the description and ranking list in the specific one
+If you want check the movie schedule in Taiwan.And choose which to see at the weekend. The gem will provide you with the current films including first, second and recommend movie list. Also allowing you check the description and ranking list in the specific one.
+0.2.1 new feature: add the function get_movie_info(movie_name) which will return the detail information of the movie if it is accessible from the atmovie.
 ## Usage
 This gem could be used as a command line utility or called from code
@@ -19,11 +21,11 @@ movie_crawler
 ### code example:
     require 'movie_crawler'
-    movie_list = MovieInfo.movies('FIRST_ROUND') # 'LATEST' or 'SECOND_ROUND'
+    movie_list = movies('FIRST_ROUND') # 'LATEST' or 'SECOND_ROUND'
     puts movie_list
-    dvd_rank = MovieInfo.dvd_rank
+    dvd_rank = dvd_rank
     puts dvd_rank
 ## Format
@@ -39,4 +41,3 @@ movie_crawler
 **runtime(minutes):** '96'
 **trailer:** http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id=fmus01587310

data/bin/app CHANGED

@@ -1,20 +1,18 @@
 #!/usr/bin/env ruby
 require 'movie_crawler'
+# require File.expand_path('../../lib/movie_crawler.rb', __FILE__)
 puts "\ndvd_rank:"
-puts MovieCrawler::MovieInfo.dvd_rank
+puts MovieCrawler.dvd_rank
 puts "\nus_weekend_rank:"
-puts MovieCrawler::MovieInfo.us_weekend
+puts MovieCrawler.us_weekend
 puts "\ntaipei_weekend_rank:"
-puts MovieCrawler::MovieInfo.taipei_weekend
+puts MovieCrawler.taipei_weekend
 puts "\nlastest_movie_list:"
-puts MovieCrawler::MovieInfo.movies('LATEST')
-puts "\nfirst_round_movie_list_:"
-puts MovieCrawler::MovieInfo.movies('FIRST_ROUND')
+puts MovieCrawler.movies('LATEST')
 puts "\nsecond_round_movie_list:"
-puts MovieCrawler::MovieInfo.movies('SECOND_ROUND')
+puts MovieCrawler.movies('SECOND_ROUND')

data/lib/.DS_Store ADDED

Binary file

data/lib/movie_crawler.rb CHANGED

@@ -1,2 +1,3 @@
+# require File.expand_path('../movie_crawler/crawler.rb', __FILE__)
 require 'movie_crawler/crawler.rb'
-require 'movie_crawler/version.rb'
+require 'movie_crawler/version.rb'

data/lib/movie_crawler/.DS_Store ADDED

Binary file

data/lib/movie_crawler/crawler.rb CHANGED

@@ -2,162 +2,215 @@ require 'nokogiri'
 require 'open-uri'
 require 'yaml'
 require 'iconv'
+require 'mechanize'
 module MovieCrawler
   # get the info from atmovies
-  class MovieInfo
-    URL_LIST = {
-      'LATEST' => 'http://www.atmovies.com.tw/movie/movie_new.html',
-      'SECOND_ROUND' => 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
-      # first_round is unnessary, the result is the same as latest.
-    }
-    MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
-    WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
-    WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
-    WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
-    WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
-    REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
-    REFLECTION_FS = '&fs=2'
-    REFLECTION_CLASS = "//div[@class = 'act01']"
-    REFLECTION_SATITLE = '&satitle='
-    REFLECTION_SAID = '&said='
-    REFLECTION_NAME = "//span[@class = 'at21b']"
-    TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
-    ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'
-    # add three rank parser
-    def self.us_weekend
-      result = get_table('1')
-      to_yaml(result)
-    end
+  URL_LIST = {
+    'LATEST' => 'http://www.atmovies.com.tw/movie/movie_new.html',
+    'SECOND_ROUND' => 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
+    # first_round is unnessary, the result is the same as latest.
+  }
+  MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
+  WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
+  WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
+  WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
+  WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
+  REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
+  REFLECTION_FS = '&fs=2'
+  REFLECTION_CLASS = "//div[@class = 'act01']"
+  REFLECTION_SATITLE = '&satitle='
+  REFLECTION_SAID = '&said='
+  REFLECTION_NAME = "//span[@class = 'at21b']"
+  TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
+  ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'
+  SEARCH_URL = 'http://search.atmovies.com.tw/search/search.cfm'
+  # start a Mechanize agent for crawling from web to web
+  def self.start_mechan
+    agent = Mechanize.new
+    agent.user_agent_alias = 'Mac Safari'
+    agent.post(SEARCH_URL)
+    search_page = agent.post(SEARCH_URL)
+    search_page.form_with(action: 'search.cfm')
+  end
-    def self.taipei_weekend
-      result = get_table('2')
-      to_yaml(result)
-    end
+  # finish a post request to search for specific film
+  def self.trace_page(form, movie_name)
+    form.search_term = movie_name # movie name
+    search_result = form.submit(form.button_with(name: 'search'))
+    link = search_result.link_with(href: /F&d=/)
+    link.click
+  end
-    def self.dvd_rank
-      result = get_table('3')
-      to_yaml(result)
+  def self.parse_comment(film_page)
+    begin
+      comment_data = film_page.at('.comment_data').text
+                     .gsub(/[\t\r\n]+/, "\t")
+                     .strip.split("\t")
+    rescue
+      comment_data = 'not exised'
     end
+  end
-    # parse the ranktable info
-    def self.get_table(rankid)
-      doc = open_html(ATMOVIES_MAIN_URL)
-      table = doc.xpath("//*[@id = 'ranklist']/div[" + rankid + ']').text
-      table = table.gsub(' : ', ':').gsub(' ', '').split
-      table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
-      table.pop
-      rankmix(table)
-    end
+  def self.parse_crew(film_page)
+    crew_row = film_page.at('.crew_row').text.gsub(/[\t\r\n]+/, "\t").strip.split(/[\t+]/)
+    crew_row = crew_row.collect { |a| a.strip }
+    crew_row -= ['']
+  end
-    # mix the rank info
-    def self.rankmix(t)
-      t.each_with_index.map do |_, index|
-        {
-          index + 1 => t[index].to_s
-        }
-      end
-    end
+  # parse the film info in the target web page
+  def self.parse_movie_info(film_page)
+    name = film_page.at('.name .at21b').text.strip
+    schedule = film_page.at('#movie_info01 b').text.strip
+    sub_content = film_page.at('.sub_content').text.strip
+    crew_row = parse_crew(film_page)
+    comment_data = parse_comment(film_page)
+    { 'name' => name, 'schedule' => schedule,
+      'crew_info' => crew_row, 'comment' => comment_data,
+      'content' => sub_content }
+  end
-    # switch to different url accordingly
-    def self.movies(category = 'LATEST')
-      result = movies_parser(category)
-      to_yaml(result)
-    end
+  # combine the workflow for user to call for movie info
+  def self.get_movie_info(movie_name)
+    form = start_mechan
+    film_page = trace_page(form, movie_name)
+    collection = parse_movie_info(film_page)
+    to_yaml(collection)
+  end
-    # parse the movies acoordingly
-    def self.movies_parser(category)
-      url = URL_LIST[category.upcase]
-      document = open_html(url)
-      titles = get_titles(document)
-      stories = get_stories(document)
-      dates = get_dates(document)
-      trailers = get_trailer(document)
-      runtimes = get_runtime(document)
-      mix(titles, stories, dates, runtimes, trailers)
-    end
+  # add three rank parser
+  def self.us_weekend
+    result = get_table('1')
+    to_yaml(result)
+  end
-    def self.encode_zh(text)
-      REFLECTION_SATITLE + URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
-    end
+  def self.taipei_weekend
+    result = get_table('2')
+    to_yaml(result)
+  end
-    def self.get_one_movie_name(doc)
-      name = doc.xpath(REFLECTION_NAME).text
-      name.gsub!(/[\t\r\n]/, '')
-    end
+  def self.dvd_rank
+    result = get_table('3')
+    to_yaml(result)
+  end
-    def self.get_reflection(doc)
-      doc.xpath(REFLECTION_CLASS).text.gsub!(/[\t\r\n]/, '').split
-    end
+  # parse the ranktable info
+  def self.get_table(rankid)
+    doc = open_html(ATMOVIES_MAIN_URL)
+    table = doc.xpath("//*[@id = 'ranklist']/div[" + rankid + ']').text
+    table = table.gsub(' : ', ':').gsub(' ', '').split
+    table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
+    table.pop
+    rankmix(table)
+  end
-    # get the details of movie
-    def self.movie_details(code)
-      open_html(MOVIE_BASE_URL + code + '/')
+  # mix the rank info
+  def self.rankmix(t)
+    t.each_with_index.map do |_, index|
+      {
+        index + 1 => t[index].to_s
+      }
     end
+  end
-    # open the destination url
-    def self.open_html(url)
-      Nokogiri::HTML(open(url))
-    end
+  # switch to different url accordingly
+  def self.movies(category = 'LATEST')
+    result = movies_parser(category)
+    to_yaml(result)
+  end
-    # get the movie name
-    def self.get_titles(doc)
-      titles = doc.xpath(WHOLE_MOVIEWS_TITLES)
-      titles.map { |title| title.text.gsub(/[\t\n\r]/, '') }
-    end
+  # parse the movies acoordingly
+  def self.movies_parser(category)
+    url = URL_LIST[category.upcase]
+    document = open_html(url)
+    titles = get_titles(document)
+    stories = get_stories(document)
+    dates = get_dates(document)
+    trailers = get_trailer(document)
+    runtimes = get_runtime(document)
+    mix(titles, stories, dates, runtimes, trailers)
+  end
-    # get the storyline of movie
-    def self.get_stories(doc)
-      storylines = doc.xpath(WHOLE_MOVIEWS_STORIES)
-      storylines.map(&:text) # { |story| story.text }
-    end
+  def self.encode_zh(text)
+    encoded = URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
+    REFLECTION_SATITLE + encoded
+  end
-    # get the runtime of movie
-    def self.get_runtime(doc)
-      days_times = split_day_and_time(doc)
-      days_times.map { |d_t| d_t[0].match(/\d+/).to_s }
-    end
+  def self.get_one_movie_name(doc)
+    name = doc.xpath(REFLECTION_NAME).text
+    name.gsub!(/[\t\r\n]/, '')
+  end
-    # get the release date
-    def self.get_dates(doc)
-      days_times = split_day_and_time(doc)
-      days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
-    end
+  def self.get_reflection(doc)
+    doc.xpath(REFLECTION_CLASS).text.gsub!(/[\t\r\n]/, '').split
+  end
-    def self.split_day_and_time(doc)
-      gap = "\n\t\t\t\t\s\s\s\s\t"
-      days_times = doc.xpath(WHOLE_MOVIEWS_DATES)
-      days_times.map { |d_t| d_t.text.split(gap) }
-    end
+  # get the details of movie
+  def self.movie_details(code)
+    open_html(MOVIE_BASE_URL + code + '/')
+  end
-    # get the code of movies
-    def self.get_codes(doc)
-      codes = doc.xpath(WHOLE_MOVIEWS_CODES)
-      codes.map { |code| code.value.split('/')[2] }
-    end
+  # open the destination url
+  def self.open_html(url)
+    Nokogiri::HTML(open(url))
+  end
-    # get the trailer link of the movies
-    def self.get_trailer(doc)
-      codes = get_codes(doc)
-      codes.map { |trailer| TRAILER_URL + trailer }
-    end
+  # get the movie name
+  def self.get_titles(doc)
+    titles = doc.xpath(WHOLE_MOVIEWS_TITLES)
+    titles.map { |title| title.text.gsub(/[\t\n\r]/, '') }
+  end
-    # build the hash for yaml output
-    def self.mix(t, s, d, ti, tr)
-      informations = t.each_with_index.map do |_, index|
-        { 'title' => t[index], 'story' => s[index], \
-          'date' => d[index], 'runtime(minutes)' => ti[index], \
-          'trailer' => tr[index] }
-      end
-      informations
-    end
+  # get the storyline of movie
+  def self.get_stories(doc)
+    storylines = doc.xpath(WHOLE_MOVIEWS_STORIES)
+    storylines.map(&:text) # { |story| story.text }
+  end
-    # convert the schedules to yaml format
-    def self.to_yaml(mix)
-      mix.to_yaml
+  # get the runtime of movie
+  def self.get_runtime(doc)
+    days_times = split_day_and_time(doc)
+    days_times.map { |d_t| d_t[0].match(/\d+/).to_s }
+  end
+  # get the release date
+  def self.get_dates(doc)
+    days_times = split_day_and_time(doc)
+    days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
+  end
+  def self.split_day_and_time(doc)
+    gap = "\n\t\t\t\t\s\s\s\s\t"
+    days_times = doc.xpath(WHOLE_MOVIEWS_DATES)
+    days_times.map { |d_t| d_t.text.split(gap) }
+  end
+  # get the code of movies
+  def self.get_codes(doc)
+    codes = doc.xpath(WHOLE_MOVIEWS_CODES)
+    codes.map { |code| code.value.split('/')[2] }
+  end
+  # get the trailer link of the movies
+  def self.get_trailer(doc)
+    codes = get_codes(doc)
+    codes.map { |trailer| TRAILER_URL + trailer }
+  end
+  # build the hash for yaml output
+  def self.mix(t, s, d, ti, tr)
+    informations = t.each_with_index.map do |_, index|
+      { 'title' => t[index], 'story' => s[index], \
+        'date' => d[index], 'runtime(minutes)' => ti[index], \
+        'trailer' => tr[index] }
     end
+    informations
+  end
+  # convert the schedules to yaml format
+  def self.to_yaml(mix)
+    mix.to_yaml
   end
 end

data/lib/movie_crawler/version.rb CHANGED

@@ -1,3 +1,3 @@
 module MovieCrawler
-  VERSION = '0.1.1'
+  VERSION = '0.2.2'
 end

data/movie_crawler.gemspec CHANGED

@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
   s.name = 'movie_crawler'
   s.version = MovieCrawler::VERSION
   s.executables << 'app'
-  s.date = '2014-10-30'
+  s.date = '2014-11-02'
   s.summary = 'Grab the movies information from the atmovies.com'
   s.description = 'Grab the movies information from the atmovies.com'
   s.authors = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
@@ -17,4 +17,5 @@ Gem::Specification.new do |s|
   s.add_development_dependency 'minitest-rg'
   s.add_runtime_dependency 'nokogiri', '>= 1.6.2' # v.1.6.2 has documented problems
   s.add_runtime_dependency 'iconv'
+  s.add_runtime_dependency 'mechanize'
 end

data/spec/movies_spec.rb CHANGED

@@ -2,13 +2,13 @@ require 'minitest/autorun'
 require 'minitest/rg'
 require File.expand_path('../../lib/movie_crawler', __FILE__)
-LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
-SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
+# LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
+# SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
 TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
 # generate a random number to test either latest or second_round
-rand < 0.5 ? url = LATEST : url = SECOND_ROUND
-sample = MovieCrawler::MovieInfo.movies_parser(url)
+rand < 0.5 ? url = 'LATEST' : url = 'SECOND_ROUND'
+sample = MovieCrawler.movies_parser(url)
 describe 'movies_parser should involve' do

data/spec/rank_spec.rb CHANGED

@@ -4,7 +4,7 @@ require File.expand_path('../../lib/movie_crawler', __FILE__)
 # 1 to 3 means diffent test case related to us, taipei, dvd
 rand_rank = rand(1..3)
-sample = MovieInfo.get_table(rand_rank.to_s)
+sample = MovieCrawler.get_table(rand_rank.to_s)
 describe 'table should involve' do

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: movie_crawler
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.2
 platform: ruby
 authors:
 - Lee Chen
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-30 00:00:00.000000000 Z
+date: 2014-11-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -68,6 +68,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Grab the movies information from the atmovies.com
 email: chung1350@hotmail.com
 executables:
@@ -75,13 +89,16 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".DS_Store"
 - ".gitignore"
 - ".travis.yml"
 - Gemfile
 - README.md
 - Rakefile
 - bin/app
+- lib/.DS_Store
 - lib/movie_crawler.rb
+- lib/movie_crawler/.DS_Store
 - lib/movie_crawler/crawler.rb
 - lib/movie_crawler/version.rb
 - movie_crawler-0.1.0.gem