the_scrap 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3b6dbb1e2bbe11284969c7a5bcc79e3bba665b96
-  data.tar.gz: acfc7d5ac75f238fc77c578b52a68954e71b64c7
+  metadata.gz: b8028accaed4377b6cb273ebaf1fcd070efaf726
+  data.tar.gz: 432a41ab802d604176a883009b07865a349f53cb
 SHA512:
-  metadata.gz: b14dd5d813c97c2a4c9f8c954d8d523004b3d3472e502c8e881301d4213d1539e417da1fb19ca0b864335356065e9b23d8ec8f546b4e464649f92a0814005ec9
-  data.tar.gz: c989f8ccca09cdef3f892ca78264f9c53c8ef52beaa0020e3482109625e92a14f6e3a49c0b64131cb7a174688ce4734272e0b8c59cd6052ee98e130e3c5fc4b0
+  metadata.gz: 3d6a4cb383cb53c49377f94b4103a20940e7f1daa9d13c48d9421d4f3a33674c78ac72a2567c649093abc96deb79edfaf8b6190e02c7270c164eb753d7551c59
+  data.tar.gz: 35274bba078799c73274c18037ce4239070169e01d7bbdee80d8534ffc7072569a008a0bd5d38af18d1ec37335297c6ddd13c22ff8664266ccca3c9e159fcbad
data/README.md CHANGED
@@ -4,6 +4,40 @@ The Scrap is a Nokogiri-based framework for scraping data from web pages
 
 The goal is to be simple to use, efficient, highly customizable, and highly adaptable.
 
+## Why
+
+**The basic workflow for scraping web data is:**
+
+1. Pick a start URL, e.g. https://ruby-china.org/topics
+2. Scrape the list. Lists are usually rendered as tr, li, div, dd, and similar nodes, one node per record; for the URL above the CSS selector is ".topics .topic".
+3. Extract each record's fields: title, author, category, detail-page URL, and so on.
+4. Scrape the detail page. A list usually carries only part of the data; the full record has to be extracted from the detail page.
+5. If the source is paginated, loop over the pages.
+6. Post-process the data.
+7. Store or output the data, deduplicate, and so on.
+
+
+**While handling these tasks you typically hit the following problems:**
+
+1. The source HTML cannot be used as-is and needs some preprocessing.
+2. Scraped items need filtering to drop invalid records.
+3. Scraped URLs need fixing up: links and images are often not absolute URLs and must be merged with the current page's address.
+4. Extracted fields need special handling. In the RubyChina example, the content under ".info leader" is "· 618 次阅读" ("viewed 618 times"), but all you want is 618.
+5. Every site has its own pagination mechanism and page-URL rules, which are tedious to handle.
+6. On output, the individually extracted fields usually have to be assembled into an object, a Hash, or similar.
+
+**Long ago I scraped data with Perl; between my Perl skills and the language's limitations it was rather painful. Later I tried many Ruby frameworks and none was satisfying (Scrubyt is probably the best one I used).**
+**So the current approach gradually took shape out of practical needs:**
+
+1. Define scraping rules for the list and the detail page.
+2. The fields to extract, and their extraction rules, are stored in a Hash via method_missing (see the sketch after this hunk).
+3. Rules can extract different attributes and data as needed; a link's href and an IMG's src are automatically passed through URI.join(current_url).
+4. Multiple list nodes can be joined or returned as an Array, e.g. tags.
+5. Several pagination styles are supported.
+6. Detail pages are fetched automatically from the URLs collected in the list pass, and their fields are merged into the same result record.
+7. Each result is a Hash; with suitably named fields it can be fed straight into any OR-mapping library without reassembly.
+8. HTML preprocessing, data filtering, and result handling are plain Ruby lambdas, which raises the level of customization and adaptability.
+
 ## Installation
 
 Add this line to your application's Gemfile:
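Item 2 in the list above is the heart of the DSL: README code such as `scrap.attr_title = 'h2 a'` calls a setter that is never defined. A minimal sketch of how such `attr_*` assignments can be captured into a rules Hash via method_missing — illustrative only, not the gem's actual implementation:

```ruby
class AttrRules
  def initialize
    @rules = {}  # e.g. {"title" => "h2 a", "ori_url" => ["h2 a", "href"]}
  end

  attr_reader :rules

  # Catch any attr_<name>= assignment and record the rule
  # instead of raising NoMethodError.
  def method_missing(name, *args)
    if name.to_s =~ /\Aattr_(\w+)=\z/
      @rules[Regexp.last_match(1)] = args.first
    else
      super
    end
  end

  def respond_to_missing?(name, include_private = false)
    name.to_s.start_with?('attr_') || super
  end
end

rules = AttrRules.new
rules.attr_title   = 'h2 a'             # extract the node's text
rules.attr_ori_url = ['h2 a', 'href']   # extract a node attribute instead
p rules.rules  #=> {"title"=>"h2 a", "ori_url"=>["h2 a", "href"]}
```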
@@ -56,6 +90,7 @@ scrap.verbose = true
 #html preprocess
 scrap.html_proc << lambda { |html|
   #html.gsub(/abcd/,'efgh')
+  html
 }
 
 #filter scraped item
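The one added line in this hunk is a real fix, not a doc tweak: a Ruby lambda returns its last evaluated expression, so with the gsub left commented out the 0.0.1 version of this snippet returned nil rather than the page source. A stand-alone illustration:

```ruby
# returns the rewritten string, as a preprocessor should
pre  = lambda { |html| html.gsub(/abcd/, 'efgh') }
# empty body: returns nil, which would presumably break any later step
noop = lambda { |html| }

p pre.call("abcd!")   #=> "efgh!"
p noop.call("abcd!")  #=> nil
```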
@@ -108,12 +143,12 @@ scrap.has_many_pages = true
 
 
 #:next_page
-scrap.page_method = :next_page
+scrap.pager_method = :next_page
 scrap.next_page_css = ".next_page a"
 
 
 #:total_page
-scrap.page_method = :total_pages
+scrap.pager_method = :total_pages
 scrap.get_page_count = lambda { |doc|
   if doc.css('.total_page').text =~ /(\d+)页/
     $~[1].to_i
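With :next_page, each "next" link is resolved against the URL it was found on (`next_page_url||url` in the gem's loop), so relative, root-relative, and absolute hrefs all work. A small demonstration with made-up URLs:

```ruby
require 'uri'

base = "http://www.example.com/news/review/index.html"
p URI.join(base, "index_2.html").to_s  #=> "http://www.example.com/news/review/index_2.html"
p URI.join(base, "/analysis/").to_s    #=> "http://www.example.com/analysis/"
p URI.join(base, "http://cdn.example.com/p2").to_s  #=> "http://cdn.example.com/p2"
```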
@@ -129,7 +164,7 @@ scrap.get_next_url = lambda { |url,next_page_number|
 }
 
 #**total_record in progress
-scrap.page_method = :total_records
+scrap.pager_method = :total_records
 #...
 
 scrap.scrap_list
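All three README hunks above track the one visible API rename in this release: the pager selector is now `pager_method`. A 0.0.1 script needs a one-line change per pager:

```ruby
scrap.page_method  = :next_page   # 0.0.1
scrap.pager_method = :next_page   # 0.0.2
```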
examples/news.rb ADDED
@@ -0,0 +1,132 @@
+# encoding: utf-8
+require 'rubygems'
+require 'the_scrap'
+require 'pp'
+
+#require 'active_record'
+#require 'mysql2'
+#require 'activerecord-import'
+
+#ActiveRecord::Base.establish_connection( :adapter => "mysql2", :host => "localhost",
+#  :database => "test", :username => "root", :password => "" )
+#
+##custom updated_at,created_at
+#ActiveRecord::Base.record_timestamps = false
+#
+#class Article < ActiveRecord::Base
+#  validates :ori_id,:uniqueness => {:scope => :cat_id}
+#end
+
+#create Object
+scrap = TheScrap::ListObj.new
+
+#set start url
+#Publishing the finished scrape target here would not be appropriate, so the URL has been changed.
+scrap.url = "http://www.xxx.com/news/review/"
+
+#fragment css selector
+scrap.item_frag = ".center ul li.content"
+
+#scrap attr list
+scrap.attr_title = 'h2 a'
+scrap.attr_ori_url = ['h2 a','href']
+scrap.attr_image = ['.detail a img','src']
+scrap.attr_description = '.detail p'
+scrap.attr_infos = '.arcilte_info'
+
+#debug
+scrap.debug = true
+scrap.verbose = true
+
+
+#html preprocess
+scrap.html_proc << lambda { |html|
+  #html.gsub(/abcd/,'efgh')
+  html
+}
+
+#filter scraped item
+scrap.item_filters << lambda { |item_info|
+  return false if item_info['title'].nil? || item_info['title'].length == 0
+  return true
+}
+
+#data process
+scrap.data_proc << lambda {|url,i|
+  i['title'] = i['title'].strip
+
+  if i['infos'] =~ /日期:(.*?)\S+点击/
+    i['created_at'] = i['updated_at'] = Time.parse($~[1].strip) - 8*3600
+  end
+  i.delete('infos')
+
+  if i['ori_url'] =~ /\d+-\d+-(\d+).html/
+    i[:ori_id] = $~[1].to_i
+  end
+
+  i[:cat_id] = @cat_id
+  i[:source] = 'xxx.com'
+
+}
+
+#result process
+scrap.result_proc << lambda {|url,items|
+  #articles = []
+  items.each do |item|
+    #articles << Article.new(item)
+    pp item
+  end
+  #Article.import articles
+}
+
+########### has many pages ###########
+#If set, the scraper can walk through multiple list pages using any of the supported pagination styles.
+
+scrap.has_many_pages = true
+
+#:next_page
+scrap.pager_method = :next_page
+scrap.next_page_css = ".pagenu .next a"
+
+################# has detail page ################
+scrap_detail = TheScrap::DetailObj.new
+scrap_detail.attr_content = [".ar_in_cont_3",:inner_html]
+
+#data process
+scrap_detail.data_proc << lambda {|url,i|
+  content = i['content'].encode('utf-8')
+
+  regex = %q{<a href="http://www\.xxx\.com/" target="_blank"><u>(.*?)</u></a>}
+  content.gsub!(/#{regex}/,'\1')
+
+  content.gsub!(/<div class="context">.*/m,'')
+  content.gsub!(/style=".*?width:.*?\d+px; height:.*?\d+px;.*?"/,'')
+
+  regex = %q{<a href="http://www.xxx.com/.*?" target="_blank">.*?</a>}
+  content.gsub!(/#{regex}/m,'')
+
+  i['content'] = content.strip
+}
+
+
+#get url from list attr and extract data via scrap_detail
+scrap.detail_info << [scrap_detail,'ori_url']
+
+scrap_detail.encoding = 'gbk'
+scrap.encoding = 'gbk'
+
+
+#scrap
+[
+  {url:'http://www.xxx.com/news/review/',cat_id:1},
+  {url:'http://www.xxx.com/news/yejie/',cat_id:2},
+  {url:'http://www.xxx.com/analysis/',cat_id:3},
+].each do |item|
+
+  scrap.url = item[:url]
+  puts "start url:#{scrap.url}"
+  @cat_id = item[:cat_id]
+  scrap.scrap_list
+
+end
+
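One non-obvious Ruby detail in this example: the loop at the bottom assigns `@cat_id`, and the `data_proc` lambda defined earlier reads it. This works because top-level code and top-level lambdas both execute against the same `main` object and therefore share its instance variables. A stand-alone illustration:

```ruby
tag = lambda { |item| item[:cat_id] = @cat_id; item }

@cat_id = 1
p tag.call({})  #=> {:cat_id=>1}

@cat_id = 2
p tag.call({})  #=> {:cat_id=>2}
```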
lib/the_scrap/list_obj.rb CHANGED
@@ -65,17 +65,22 @@ module TheScrap
       scrap(url)
     end
 
-    return unless has_many_pages
+    return unless @has_many_pages
 
     #TODO Refactor it
     next_page_url = nil
-    if pager_method == :next_page #the site has a "next page" link
+    prev_page_url = nil
+    if @pager_method == :next_page #the site has a "next page" link
       while node = doc.css(next_page_css).first
         next_page_url = URI.join(next_page_url||url,node['href']).to_s
-        puts next_page_url if verbose?
+        break if prev_page_url == next_page_url
+
+        puts "url: #{next_page_url}" if verbose?
         doc,items = retryable(:tries => 3, :on => Timeout::Error) do
           scrap(next_page_url)
         end
+
+        prev_page_url = next_page_url
         break if items.count == 0
         break if debug?
       end
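The substance of this hunk is a loop guard: on sites whose last page links to itself, the 0.0.1 pager re-fetched that page forever. A minimal, self-contained sketch of the same guard (the page graph is made up):

```ruby
# each page maps to its "next page" link; the last one links to itself
pages = { "p1" => "p2", "p2" => "p3", "p3" => "p3" }

prev_url = nil
next_url = "p1"
while next_url
  break if next_url == prev_url   # the guard added in 0.0.2
  puts "fetching #{next_url}"
  prev_url = next_url
  next_url = pages[next_url]      # stand-in for doc.css(next_page_css)
end
#=> fetches p1, p2, p3 once each, then stops
```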
lib/the_scrap/version.rb CHANGED
@@ -1,3 +1,3 @@
 module TheScrap
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: the_scrap
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - H.J.LeoChen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-18 00:00:00.000000000 Z
+date: 2014-09-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -64,6 +64,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/news.rb
 - lib/the_scrap.rb
 - lib/the_scrap/detail_obj.rb
 - lib/the_scrap/list_obj.rb