the_scrap 0.0.1 → 0.0.2
- checksums.yaml +4 -4
- data/README.md +38 -3
- data/examples/news.rb +132 -0
- data/lib/the_scrap/list_obj.rb +8 -3
- data/lib/the_scrap/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b8028accaed4377b6cb273ebaf1fcd070efaf726
+  data.tar.gz: 432a41ab802d604176a883009b07865a349f53cb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d6a4cb383cb53c49377f94b4103a20940e7f1daa9d13c48d9421d4f3a33674c78ac72a2567c649093abc96deb79edfaf8b6190e02c7270c164eb753d7551c59
+  data.tar.gz: 35274bba078799c73274c18037ce4239070169e01d7bbdee80d8534ffc7072569a008a0bd5d38af18d1ec37335297c6ddd13c22ff8664266ccca3c9e159fcbad
data/README.md
CHANGED
@@ -4,6 +4,40 @@ The Scrap is a web-scraping framework based on Nokogiri
 
 The goal is to be simple to use, efficient, highly customizable and highly adaptable.
 
+## Why
+
+**The basic workflow for scraping web data is:**
+
+1. Determine the start URL, e.g. https://ruby-china.org/topics
+2. Scrape the list. List entries are usually rendered as tr, li, div, dd, etc., one node per record; for the URL above the CSS selector is ".topics .topic".
+3. Extract each record's fields: title, author, category, detail-page URL and so on.
+4. Scrape the detail page. A list usually carries only part of the data, so getting the complete record means extracting data from the detail page.
+5. If the source is paginated, loop over multiple pages.
+6. Post-process the data.
+7. Store or output the data, deduplicate, and so on.
+
+
+**While doing this you typically run into the following problems:**
+
+1. The source HTML cannot be used as-is and needs some preprocessing.
+2. Scraped items need filtering to drop invalid records.
+3. The scraped URLs need handling: links and images are often not absolute URLs and have to be merged with the current page URL.
+4. Extracted values need special handling. Take the RubyChina example again: the content under ".info leader" is "· 618 次阅读", but all you want is 618.
+5. Every site has its own pagination mechanism and pagination URL scheme, which is tedious to handle.
+6. When producing output you usually have to reassemble the individually extracted fields into an object or a Hash.
+
+**I used to scrape with Perl; between my own Perl skills and some limitations of the language it was fairly painful. Later I tried many Ruby scraping frameworks and none of them was really satisfying (Scrubyt is probably the best one I have used).**
+**So, driven by practical needs, the approach gradually settled into its current form:**
+
+1. Define scraping rules for the list page and for the detail page.
+2. The fields to extract and their extraction rules are stored in a Hash via method_missing.
+3. A rule can extract different attributes and data as needed; a link's href and an image's src are automatically run through URI.join(current_url).
+4. Multiple matching nodes can be joined or returned as an Array, e.g. tags.
+5. Several pagination styles are supported.
+6. Detail pages are fetched automatically from the detail-page URLs collected while scraping the list, and their data is merged into the same result record.
+7. Each result is a Hash; with suitably named fields it can be written to the database directly through any OR-mapping library, with no reassembly needed.
+8. HTML preprocessing, data filtering and result handling are implemented with Ruby lambdas, which improves customizability and adaptability.
+
 ## Installation
 
 Add this line to your application's Gemfile:
@@ -56,6 +90,7 @@ scrap.verbose = true
 #html preprocess
 scrap.html_proc << lambda { |html|
   #html.gsub(/abcd/,'efgh')
+  html
 }
 
 #filter scraped item
@@ -108,12 +143,12 @@ scrap.has_many_pages = true
 
 
 #:next_page
-scrap.
+scrap.pager_method = :next_page
 scrap.next_page_css = ".next_page a"
 
 
 #:total_page
-scrap.
+scrap.pager_method = :total_pages
 scrap.get_page_count = lambda { |doc|
   if doc.css('.total_page').text =~ /(\d+)页/
     $~[1].to_i
@@ -129,7 +164,7 @@ scrap.get_next_url = lambda { |url,next_page_number|
 }
 
 #**total_record in progress
-scrap.
+scrap.pager_method = :total_records
 #...
 
 scrap.scrap_list
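The three `pager_method` hunks above show the corrected README snippets only in isolation. As a reading aid, here is a minimal sketch of how the pagination settings documented there fit together; it uses only the accessors that actually appear in the diff (`has_many_pages`, `pager_method`, `next_page_css`, `get_page_count`, `get_next_url`), the selector and URL scheme are placeholders, and a real script would set just one `pager_method`:

    require 'the_scrap'

    scrap = TheScrap::ListObj.new
    scrap.has_many_pages = true

    # Option 1: follow a "next page" link found on each list page.
    scrap.pager_method  = :next_page
    scrap.next_page_css = ".next_page a"          # placeholder selector

    # Option 2: read the total page count once, then build each page URL yourself.
    scrap.pager_method   = :total_pages
    scrap.get_page_count = lambda { |doc|
      doc.css('.total_page').text =~ /(\d+)/ ? $~[1].to_i : 1
    }
    scrap.get_next_url = lambda { |url, next_page_number|
      "#{url}?page=#{next_page_number}"           # placeholder URL scheme
    }

    # Option 3, :total_records, is still marked "in progress" in the README.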
data/examples/news.rb
ADDED
@@ -0,0 +1,132 @@
+# encoding: utf-8
+require 'rubygems'
+require 'the_scrap'
+require 'pp'
+
+#require 'active_record'
+#require 'mysql2'
+#require 'activerecord-import'
+
+#ActiveRecord::Base.establish_connection( :adapter => "mysql2", :host => "localhost",
+#  :database => "test", :username => "root", :password => "" )
+#
+##custom update_at,created_at
+#ActiveRecord::Base.record_timestamps = false
+#
+#class Article < ActiveRecord::Base
+#  validates :ori_id,:uniqueness => {:scope => :cat_id}
+#end
+
+#create Object
+scrap = TheScrap::ListObj.new
+
+#set start url
+#(publishing the finished scraper as-is would not be appropriate, so the site URL has been changed)
+scrap.url = "http://www.xxx.com/news/review/"
+
+#fragment css selector
+scrap.item_frag = ".center ul li.content"
+
+#scrap attr list
+scrap.attr_title = 'h2 a'
+scrap.attr_ori_url = ['h2 a','href']
+scrap.attr_image = ['.detail a img','src']
+scrap.attr_description = '.detail p'
+scrap.attr_infos = '.arcilte_info'
+
+#debug
+scrap.debug = true
+scrap.verbose = true
+
+
+#html preprocess
+scrap.html_proc << lambda { |html|
+  #html.gsub(/abcd/,'efgh')
+  html
+}
+
+#filter scraped item
+scrap.item_filters << lambda { |item_info|
+  return false if item_info['title'].nil? || item_info['title'].length == 0
+  return true
+}
+
+#data process
+scrap.data_proc << lambda {|url,i|
+  i['title'] = i['title'].strip
+
+  if i['infos'] =~ /日期:(.*?)\S+点击/
+    i['created_at'] = i['updated_at'] = Time.parse($~[1].strip) - 8*3600
+  end
+  i.delete('infos')
+
+  if i['ori_url'] =~ /\d+-\d+-(\d+).html/
+    i[:ori_id] = $~[1].to_i
+  end
+
+  i[:cat_id] = @cat_id
+  i[:source] = 'xxx.com'
+
+}
+
+#result process
+scrap.result_proc << lambda {|url,items|
+  #articles = []
+  items.each do |item|
+    #articles << Article.new(item)
+    pp item
+  end
+  #Article.import articles
+}
+
+########### has many pages ###########
+#when enabled, multiple list pages are scraped using the configured pagination mode
+
+scrap.has_many_pages = true
+
+#:next_page
+scrap.pager_method = :next_page
+scrap.next_page_css = ".pagenu .next a"
+
+################# has detail page ################
+scrap_detail = TheScrap::DetailObj.new
+scrap_detail.attr_content = [".ar_in_cont_3",:inner_html]
+
+#data process
+scrap_detail.data_proc << lambda {|url,i|
+  content = i['content'].encode('utf-8')
+
+  regex = %q{<a href="http://www\.xxx\.com/" target="_blank"><u>(.*?)</u></a>}
+  content.gsub!(/#{regex}/,'\1')
+
+  content.gsub!(/<div class="context">.*/m,'')
+  content.gsub!(/style=".*?width:.*?\d+px; height:.*?\d+px;.*?"/,'')
+
+  regex = %q{<a href="http://www.xxx.com/.*?" target="_blank">.*?</a>}
+  content.gsub!(/#{regex}/m,'')
+
+  i['content'] = content.strip
+}
+
+
+#get url from list attr and extra data by scrap_detail
+scrap.detail_info << [scrap_detail,'ori_url']
+
+scrap_detail.encoding = 'gbk'
+scrap.encoding = 'gbk'
+
+
+#scrap
+[
+  {url:'http://www.xxx.com/news/review/',cat_id:1},
+  {url:'http://www.xxx.com/news/yejie/',cat_id:2},
+  {url:'http://www.xxx.com/analysis/',cat_id:3},
+].each do |item|
+
+  scrap.url = item[:url]
+  puts "start url:#{scrap.url}"
+  @cat_id = item[:cat_id]
+  scrap.scrap_list
+
+end
+
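In the example above, `scrap.detail_info << [scrap_detail,'ori_url']` is what ties the two scrapers together: for each list item the gem takes the URL captured as `ori_url`, scrapes it with `scrap_detail`, and merges the extracted fields into the same item Hash (the behaviour the README describes). A small illustrative sketch of that flow follows; the `attr_author` field and the `.author` selector are invented here purely for demonstration:

    # Detail scraper: 'content' comes from the example above; 'author' is hypothetical.
    scrap_detail = TheScrap::DetailObj.new
    scrap_detail.attr_content = [".ar_in_cont_3", :inner_html]
    scrap_detail.attr_author  = ".author"   # hypothetical extra field, stored via method_missing

    # 'ori_url' was collected on the list page by attr_ori_url; fields scraped
    # from that URL end up in the same result Hash as the list fields.
    scrap.detail_info << [scrap_detail, 'ori_url']

    scrap.result_proc << lambda { |url, items|
      items.each { |item| pp item['title'], item['author'] }
    }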
data/lib/the_scrap/list_obj.rb
CHANGED
@@ -65,17 +65,22 @@ module TheScrap
        scrap(url)
      end
 
-     return unless
+     return unless @has_many_pages
 
      #TODO Refactor it
      next_page_url = nil
-
+     prev_page_url = nil
+     if @pager_method == :next_page #pagination driven by a "next page" link
      while node = doc.css(next_page_css).first
        next_page_url = URI.join(next_page_url||url,node['href']).to_s
-
+       break if prev_page_url == next_page_url
+
+       puts "url: #{next_page_url}" if verbose?
        doc,items = retryable(:tries => 3, :on => Timeout::Error) do
          scrap(next_page_url)
        end
+
+       prev_page_url = next_page_url
        break if items.count == 0
        break if debug?
      end
data/lib/the_scrap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: the_scrap
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - H.J.LeoChen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-09-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -64,6 +64,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/news.rb
 - lib/the_scrap.rb
 - lib/the_scrap/detail_obj.rb
 - lib/the_scrap/list_obj.rb