RubyGems - baiduserp - Versions diffs - 0.1.1 → 2.0.0 - Mend

baiduserp 0.1.1 → 2.0.0

Files changed (11) hide show

data/lib/baiduserp/client.rb +9 -0
data/lib/baiduserp/helper.rb +14 -0
data/lib/baiduserp/parser.rb +18 -141
data/lib/baiduserp/parser/ads_left.rb +5 -0
data/lib/baiduserp/parser/ads_right.rb +5 -0
data/lib/baiduserp/parser/organic.rb +28 -0
data/lib/baiduserp/parser/pinpaizhuanqu.rb +5 -0
data/lib/baiduserp/parser/related_keywords.rb +11 -0
data/lib/baiduserp/parser/result_num.rb +17 -0
data/lib/baiduserp/version.rb +1 -1
metadata +26 -2

data/lib/baiduserp/client.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require 'httparty'
+module Baiduserp
+  class Client
+    include HTTParty
+    base_uri 'www.baidu.com'
+    follow_redirects false
+  end
+end

data/lib/baiduserp/helper.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Baiduserp
+  module Helper
+    class << self
+      # get content safe from nokogiri search result
+      def get_content_safe(noko)
+        return nil if noko.nil?
+        return nil if noko.empty?
+        noko.first.content
+      end
+    end
+  end
+end

data/lib/baiduserp/parser.rb CHANGED Viewed

@@ -1,17 +1,26 @@
 # -*- coding: utf-8 -*-
 require 'nokogiri'
 require 'uri'
-require 'open-uri'
+require 'baiduserp/client'
+require 'baiduserp/helper'
 module Baiduserp
   class Parser
+    Dir[File.expand_path('../parser/*.rb', __FILE__)].each{|f| require f}
     def parse(html)
-      @html = html
-      @doc = Nokogiri::HTML(@html)
-      @results = []
+      @file = Hash.new
       @serp = Hash.new
-      parse_serp_results
-      parse_serp_attrs
+      @file[:html] = html
+      @file[:doc] = Nokogiri::HTML(html)
+      self.class.constants.each do |m|
+        #puts m
+        eval "@serp[:#{m.downcase}] = #{m}.parse @file"
+        #p @serp.keys
+      end
       @serp
     end
@@ -21,144 +30,12 @@ module Baiduserp
     def parse_file(file_path)
       if File.exists? file_path
-        html = open(file_path)
+        html = open(file_path).read
       else
-        html = open(URI.escape(file_path))
+        html = Client.get(URI.escape(file_path)).body
       end
-      html = html.read.encode!('UTF-8','UTF-8',:invalid => :replace)
+      html = html.encode!('UTF-8','UTF-8',:invalid => :replace)
       parse html
     end
-    private
-    def get_content_safe(noko)
-      return nil if noko.nil?
-      return nil if noko.empty?
-      noko.first.content
-    end
-    def parse_serp_results
-      # left side results
-      @doc.search("//table").each do |table|
-        id = table['id'].to_i
-        parse_serp_table(id,table) if id > 0
-      end
-      # right side ads
-      parse_right_side_ads
-      @serp[:serp_results] = @results
-    end
-    def parse_right_side_ads
-      @doc.search("//div[@class='EC_fr EC_PP']").each do |table|
-        id = table['id'].to_s.sub('bdfs','').to_i
-        rank = id + 1
-        url = @doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1' and @color='#008000']").first.content
-        title = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//a"))
-        content = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1']"))
-        @results << {:paid => 2, :rank => rank, :url => url, :title => title, :content => content}
-      end
-    end
-    def get_url_part_from_string(str)
-      str.split(/( |\s)/).each do |s|
-        return s if s.include? '.'
-      end
-      nil
-    end
-    def parse_serp_url(table_id)
-      id = table_id
-      url = nil
-      if id > 3000
-        link_types = ["//table[@id='#{id}']//font[@size='-1' and @color='#008000']"]
-      else
-        link_types = ["//table[@id='#{id}']//span[@class='g']",
-                      "//table[@id='#{id}']//font[@color='#008000']",
-                      "//table[@id='#{id}']//span[@style='color:#008000']",
-                      "//table[@id='#{id}']//span[@style='color:#008000;']",
-                      "//table[@id='#{id}']//span[@color='#008000']",
-                      "//table[@id='#{id}']//p[@class='g']",
-                      "//table[@id='#{id}']//cite[@color='#008000']",
-                      "//table[@id='#{id}']//cite",
-                      "//table[@id='#{id}']//span[@id='ala_img_desc']"
-                     ]
-      end
-      link_types.each do |link_type|
-        link_search = @doc.search(link_type)
-        url2 = nil
-        url2 =  get_url_part_from_string(link_search[0].content) if link_search.size > 0
-        if url.nil? && (not url2.nil?)
-          url = url2
-        end
-      end
-      url
-    end
-    def parse_serp_content(id)
-      get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
-    end
-    def parse_serp_table(id,table)
-      result = Hash.new
-      result[:rank] = id
-      result[:url] = parse_serp_url(id)
-      result[:title] = get_content_safe(table.css('h3'))
-      result[:content] = parse_serp_content(id)
-      if id >= 3000 # sem ads
-        result[:paid] = 1
-      else # organic results
-        result[:paid] = 0
-        # baidu open
-        table.css('a').each do |link|
-          result[:baiduopen] = 1 if link['href'].to_s.include? 'open.baidu.com'
-        end
-        # baidu table mu attr (for maps,baike)
-        result[:mu] = table['mu'] unless table['mu'].nil?
-      end
-      @results << result
-    end
-    # parse baidu serp attrs : result_num, baidubrand, related_keywords
-    def parse_serp_attrs
-      @serp[:result_num] = parse_serp_result_num
-      @serp[:baidubrand] = parse_serp_baidu_brand
-      @serp[:related_keywords] = parse_serp_related_search
-    end
-    def parse_serp_related_search
-      result = []
-      @doc.search('div[@id="rs"]').each do |rs|
-        rs.css('a').each do |link|
-          result << link.content
-        end
-      end
-      result
-    end
-    def parse_serp_baidu_brand
-      if @html.include? 'bs.baidu.com/adcoup-mat'
-        result = 1
-      else
-        result = 0
-      end
-      result
-    end
-    def parse_serp_result_num
-      str = @html.scan(/找到相关结果(.*)个/).join
-      str = str.gsub('约','')
-      if str.include?('万')
-        parts = str.split('万')
-        return parts[0].to_i * 10000 + parts[1].to_i
-      end
-      str.gsub(',', '').to_i
-    end
   end
 end

data/lib/baiduserp/parser/ads_left.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Baiduserp::Parser::Ads_Left
+  def self.parse(file)
+  end
+end

data/lib/baiduserp/parser/ads_right.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Baiduserp::Parser::Ads_Right
+  def self.parse(file)
+  end
+end

data/lib/baiduserp/parser/organic.rb ADDED Viewed

@@ -0,0 +1,28 @@
+module Baiduserp::Parser::Organic
+  def self.parse(file)
+    result = []
+    file[:doc].search("//table").each do |table|
+      id = table['id'].to_i
+      next unless id > 0
+      r = Hash.new
+      url = table.search("h3/a").first['href']
+      url = Baiduserp::Client.get(url).headers['location'] if url.include?('http://www.baidu.com/link?')
+      r[:url] = url
+      r[:title] = Baiduserp::Helper.get_content_safe(table.search('h3'))
+      r[:content] = Baiduserp::Helper.get_content_safe(table.search("div[@class='c-abstract']"))
+      r[:mu] = table['mu']
+      table.search('a').each do |link|
+        r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
+      end
+      r[:baiduopen] = false if r[:baiduopen].nil?
+      result << r
+    end
+    result
+  end
+end

data/lib/baiduserp/parser/pinpaizhuanqu.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Baiduserp::Parser::PinPaiZhuanQu
+  def self.parse(file)
+    file[:html].include? 'bs.baidu.com/adcoup-mat'
+  end
+end

data/lib/baiduserp/parser/related_keywords.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module Baiduserp::Parser::Related_Keywords
+  def self.parse(file)
+    result = []
+    file[:doc].search('div[@id="rs"]').each do |rs|
+      rs.css('a').each do |link|
+        result << link.content
+      end
+    end
+    result
+  end
+end

data/lib/baiduserp/parser/result_num.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# coding: utf-8
+module Baiduserp::Parser::Result_Num
+  def self.parse(file)
+    html = file[:html]
+    str = html.scan(/找到相关结果(.*)个/).join
+    str = str.gsub('约','')
+    if str.include?('万')
+      parts = str.split('万')
+      result = parts[0].to_i * 10000 + parts[1].to_i
+    else
+      result = str.gsub(',', '').to_i
+    end
+    result
+  end
+end

data/lib/baiduserp/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Baiduserp
-  VERSION = "0.1.1"
+  VERSION = "2.0.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: baiduserp
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 2.0.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-21 00:00:00.000000000 Z
+date: 2013-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: httparty
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Parse Baidu SERP result page.
 email:
 - zmingqian@qq.com
@@ -35,6 +51,14 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- lib/baiduserp/client.rb
+- lib/baiduserp/helper.rb
+- lib/baiduserp/parser/ads_left.rb
+- lib/baiduserp/parser/ads_right.rb
+- lib/baiduserp/parser/organic.rb
+- lib/baiduserp/parser/pinpaizhuanqu.rb
+- lib/baiduserp/parser/related_keywords.rb
+- lib/baiduserp/parser/result_num.rb
 - lib/baiduserp/parser.rb
 - lib/baiduserp/version.rb
 - lib/baiduserp.rb