RubyGems - nicoscraper - Versions diffs - 0.1.0 - Mend

nicoscraper 0.1.0

Files changed (18) hide show

@@ -0,0 +1,6 @@
+require 'rubygems'
+require 'ruby-debug'
+require 'movie'
+require 'mylist'
+require 'getmovie'

data/lib/parser.rb ADDED

@@ -0,0 +1,247 @@
+# -*- encoding: utf-8 -*-
+require 'rubygems'
+require 'xml'
+require 'time'
+require 'converter'
+module NicoParser
+  public
+  def getThumbInfo(xml)
+    doc = XML::Reader.string(
+      xml,
+      :options => XML::Parser::Options::NOBLANKS |
+      XML::Parser::Options::NOENT
+    )
+    n = -1
+    parsed = {}
+    category = ""
+    while doc.read
+      unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
+        case doc.name
+        when "video_id", "title", "description", "thumbnail_url",
+              "movie_type", "last_res_body" , "watch_url", "thumb_type"
+          label = doc.name
+          doc.read
+          parsed[label] = doc.value
+        when "size_high", "size_low", "view_counter", "comment_num",
+              "mylist_counter", "embeddable", "no_live_play",
+              "user_id"
+          label = doc.name
+          doc.read
+          parsed[label] = doc.value.to_i
+        when "first_retrieve"
+          label = doc.name
+          doc.read
+          parsed[label] =  Convert.iso8601ToUnix(doc.value)
+        when "length"
+          doc.read
+          lengthStr = doc.value.split(/\:/)
+          length   = lengthStr[0].to_i * 60 + lengthStr[1].to_i
+          parsed["length"] =  length
+        when "tags"
+          doc.move_to_attribute("domain")
+          category = doc.value
+          if defined? parsed["tags" + category]
+            parsed["tags_" + category] = []
+          end
+        when "tag"
+          doc.read
+          parsed["tags_" + category].push(doc.value)
+        end
+      end
+    end
+    doc.close
+    parsed
+  end
+  def tagRss(xml)
+    doc = XML::Reader.string(
+      xml,
+      :options => XML::Parser::Options::NOBLANKS |
+      XML::Parser::Options::NOENT
+    )
+    n = -1
+    parsed = [{}]
+    while doc.read
+      unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
+        case doc.name
+        when "entry"
+          n += 1
+          parsed[n] = {}
+        when "title"
+          doc.read
+          parsed[n]["title"] =  doc.value
+        when "link"
+          doc.move_to_attribute("href")
+          parsed[n]["video_id"] =  doc.value.split('/')[4]
+        when "published", "updated"
+          label = doc.name
+          doc.read
+          parsed[n][label] =  Convert.iso8601ToUnix(doc.value)
+        when "p"
+          doc.move_to_attribute("class")
+          case doc.value
+          when "nico-thumbnail"
+            doc.read
+            doc.move_to_attribute("src")
+            parsed[n]["thumbnail_url"] =  doc.value
+          when "nico-description"
+            doc.read
+            parsed[n]["description"] =  doc.value
+          end
+        when "strong"
+          doc.move_to_attribute("class")
+          case doc.value
+          when "nico-info-length"
+            doc.read
+            lengthStr = doc.value.split(/\:/)
+            length   = lengthStr[0].to_i * 60 + lengthStr[1].to_i
+            parsed[n]["length"] =  length
+          when "nico-numbers-view", "nico-numbers-res",
+                "nico-numbers-mylist"
+            label = doc.value
+            doc.read
+            parsed[n][label.slice(13,99)] =  doc.value.to_i
+          end
+        end
+      end
+    end
+    doc.close
+    parsed
+  end
+  def mylistRss(xml)
+    doc = XML::Reader.string(
+      xml,
+      :options => XML::Parser::Options::NOBLANKS |
+      XML::Parser::Options::NOENT
+    )
+    n = -1
+    parsed = { "mylist" => {}, "entry" => [{}] }
+    while doc.read
+      unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
+        case doc.name
+        # <title> and <id> are marked up both in mylist and
+        # each entry's node. So we need to assign the value to the
+        # appropriate variable in accordance with node's location.
+        when "title"
+          if n == -1
+            doc.read
+            d = doc.value
+            tmp = doc.value.slice(6, 99)
+            tmp = tmp.slice(0, tmp.length - 7)
+            parsed["mylist"]["title"] = tmp
+          else
+            doc.read
+            parsed["entry"][n]["title"] = doc.value
+          end
+        when "link"
+          if n != -1
+            doc.move_to_attribute("href")
+            parsed["entry"][n]["video_id"] =
+              Extract.videoId(doc.value)
+          end
+        when "subtitle"
+          doc.read
+          parsed["entry"][n]["description"] = doc.value
+        when "id"
+          if n == -1
+            doc.read
+            parsed["mylist"]["mylist_id"] =
+              Extract.mylistId(doc.value)
+          else
+            doc.read
+            parsed["entry"][n]["item_id"] =
+                Extract.itemId(doc.value)
+          end
+        when "updated"
+          doc.read
+          parsed["mylist"]["updated"] =
+            Convert.iso8601ToUnix(doc.value)
+        when "name"
+          doc.read
+          parsed["mylist"]["author"] = doc.value
+        when "entry"
+          n += 1
+          parsed["entry"][n] = {}
+        when "content"
+          doc.read
+          html = doc.value
+          memo =
+            html.slice(
+              /<p\sclass\=\"nico-memo\"\>[^\<]{1,}/
+            ).to_s.slice(21, 999)
+          /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
+          thumbnail_url = $2
+          description =
+            html.slice(
+              /<p\sclass\=\"nico-description\"\>[^\<]{1,}/
+            ).to_s.slice(31, 999)
+          length =
+            Convert.toSeconds(
+              html.slice(
+                /<strong\sclass\=\"nico-info-length\"\>[^\<]{1,}/
+              ).to_s.slice(33, 999)
+            )
+          first_retrieve =
+            Convert.japToUnix(
+              html.slice(
+                /<strong\sclass\=\"nico-info-date\"\>[^\<]{1,}/
+              ).to_s.slice(31, 999)
+            )
+          view =
+            Convert.commaRemover(
+              html.slice(
+                /<strong\sclass\=\"nico-numbers-view\"\>[^\<]{1,}/
+              ).to_s.slice(34, 999)
+            )
+          res =
+            Convert.commaRemover(
+              html.slice(
+                /<strong\sclass\=\"nico-numbers-res\"\>[^\<]{1,}/
+              ).to_s.slice(33, 999)
+            )
+          mylist =
+            Convert.commaRemover(
+              html.slice(
+                /<strong\sclass\=\"nico-numbers-mylist\"\>[^\<]{1,}/
+              ).to_s.slice(36, 999)
+            )
+          parsed["entry"][n]["memo"] = memo
+          parsed["entry"][n]["thumbnail_url"] = thumbnail_url
+          parsed["entry"][n]["description"] = description
+          parsed["entry"][n]["length"] = length
+          parsed["entry"][n]["first_retrieve"] = first_retrieve
+          parsed["entry"][n]["view"] = view
+          parsed["entry"][n]["res"] = res
+          parsed["entry"][n]["mylist"] = mylist
+        end
+      end
+    end
+    doc.close
+    parsed
+  end
+  module_function :tagRss
+  module_function :mylistRss
+  module_function :getThumbInfo
+end

data/lib/searcher.rb ADDED

@@ -0,0 +1,205 @@
+# -*- encoding: utf-8 -*-
+require 'rubygems'
+require 'ruby-debug'
+require 'time'
+require 'mechanize'
+require 'kconv'
+require 'parser'
+$wait_byTag = {
+  'consec_count'  => 10,  # number of requests made in one consecutive run
+  'consec_wait'   => 10,  # wait after a run of consecutive requests
+  'each'          => 10,  # wait per request within a consecutive run
+  'rejected'      => 120, # time until retry when access is rejected
+                          # (the "consecutive access in a short time..." response)
+  '403'           => 600, # wait before retrying on "403"
+  'increment'     => 1,   # amount added to the per-request wait after a rejection
+  'timeout'       => 5,   # wait before retrying on timeout
+  '500'           => 600, # wait before retrying on "500"
+  '503'           => 600, # wait before retrying on "503"
+  'allowance_time'=> 5    # limit on the number of retries
+}
+$wait_byMylistLt = {  # same keys and values as $wait_byTag
+  'consec_count'  => 10,
+  'consec_wait'   => 10,
+  'each'          => 10,
+  'rejected'      => 120,
+  '403'           => 600,
+  'increment'     => 1,
+  'timeout'       => 5,
+  '500'           => 600,
+  '503'           => 600,
+  'allowance_time'=> 5
+}
+module GetMovie
+  public
+  def byTag (tag, sort, waitObj, &block)
+    gMByTag = GetMovieByTag.new()
+    gMByTag.execute(tag, sort, waitObj) { |result, page|
+      block.call(result, page)
+    }
+  end
+  def byTagLt (tag, sort, waitObj, &block)
+    gMByTagLt = GetMovieByTagLt.new()
+    gMByTagLt.execute(tag, sort, waitObj) { |result, page|
+      block.call(result, page)
+    }
+  end
+  module_function :byTag
+  module_function :byTagLt
+end
+class GetMovieByTagSuper
+  private
+  def get (tag, sort, page, method, waitObj)
+    paramAry = []
+    case sort
+      when 'comment_new'
+        sortStr = ''
+      when 'comment_old'
+        sortStr = 'order=a'
+      when 'view_many'
+        sortStr = 'sort=v'
+      when 'view_few'
+        sortStr = 'sort=v&order=a'
+      when 'comment_many'
+        sortStr = 'sort=r'
+      when 'comment_few'
+        sortStr = 'sort=r&order=a'
+      when 'mylist_many'
+        sortStr = 'sort=m'
+      when 'mylist_few'
+        sortStr = 'sort=m&order=a'
+      when 'post_new'
+        sortStr = 'sort=f'
+      when 'post_old'
+        sortStr = 'sort=f&order=a'
+      when 'length_long'
+        sortStr = 'sort=l'
+      when 'length_short'
+        sortStr = 'sort=l&order=a'
+    end
+    if page != 1 then paramAry.push("page=#{page}"); end
+    paramAry.push(sortStr)
+    if method == "atom" then paramAry.push("rss=atom&numbers=1") end
+    param = tag + "?" + paramAry.join('&')
+    host = 'www.nicovideo.jp'
+    entity = '/tag/' + param
+    @con.setWait(waitObj)
+    @con.get(host, entity)
+  end
+  public
+  def loop (tag, sort, method, waitObj, &block)
+    termFlag = false
+    page   = 1
+    begin
+       result  = []
+       response = get(
+        tag,
+        sort,
+        page,
+        method,
+        waitObj
+      )
+      if response
+        result = parse(response)
+        termFlag = block.call(result, page)
+      else
+        termFlag = true
+      end
+      page += 1
+    end until termFlag
+  end
+end
+class GetMovieByTag < GetMovieByTagSuper
+  def initialize
+    @NumOfSearched = 32
+    @incrAmt = 0.2
+    @con = Connector.new('mech')
+    # HTML中の各パラメータの所在を示すXPath
+    @videoIdXP  = "//div[@class='uad_thumbfrm']/table/tr/td/p/a"
+    @lengthXP   = "//div[@class='uad_thumbfrm']/table/tr/td/p[2]/span"
+    @viewXP     = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[1]/strong"
+    @resXP      = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[2]/strong"
+    @mylistXP   = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[3]/a/strong"
+    @adXP       = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[4]/a/strong"
+  end
+  def parse(movieNum)
+    result = []
+    video_id  = /(sm|nm)[0-9]{1,}/.match(@con.mech.page.search(@videoIdXP)[movieNum]['href'])[0]
+      lengthStr = @con.mech.page.search(@lengthXP)[movieNum].text.split(/\:/)
+    length    = lengthStr[0].to_i * 60 + lengthStr[1].to_i
+    view      = @con.mech.page.search(@viewXP)[movieNum]
+                .text.gsub(/\,/, '').to_i
+    res       = @con.mech.page.search(@resXP)[movieNum]
+                .text.gsub(/\,/, '').to_i
+    mylist    = @con.mech.page.search(@mylistXP)[movieNum]
+                .text.gsub(/\,/, '').to_i
+    ad        = @con.mech.page.search(@adXP)[movieNum]
+                .text.gsub(/\,/, '').to_i
+    result.push({
+      "video_id"  => video_id,
+      "length"    => length,
+      "view"      => view,
+      "res"       => res,
+      "mylist"    => mylist,
+      "ad"        => ad
+    })
+  end
+  def execute(tag, sort, waitObj, &block)
+    loop(tag, sort, "mech", waitObj) { |result, page|
+      block.call(result, page)
+    }
+  end
+end
+class GetMovieByTagLt < GetMovieByTagSuper
+  def initialize
+    @NumOfSearched = 32
+    @incrAmt = 0.2
+    @con = Connector.new('atom')
+  end
+  def parse(xml)
+    NicoParser.tagRss(xml)
+  end
+  def execute(tag, sort, waitObj, &block)
+    loop(tag, sort, "atom", waitObj) { |result, page|
+      block.call(result, page)
+    }
+  end
+end