the_scrap 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3b6dbb1e2bbe11284969c7a5bcc79e3bba665b96
-  data.tar.gz: acfc7d5ac75f238fc77c578b52a68954e71b64c7
+  metadata.gz: b8028accaed4377b6cb273ebaf1fcd070efaf726
+  data.tar.gz: 432a41ab802d604176a883009b07865a349f53cb
 SHA512:
-  metadata.gz: b14dd5d813c97c2a4c9f8c954d8d523004b3d3472e502c8e881301d4213d1539e417da1fb19ca0b864335356065e9b23d8ec8f546b4e464649f92a0814005ec9
-  data.tar.gz: c989f8ccca09cdef3f892ca78264f9c53c8ef52beaa0020e3482109625e92a14f6e3a49c0b64131cb7a174688ce4734272e0b8c59cd6052ee98e130e3c5fc4b0
+  metadata.gz: 3d6a4cb383cb53c49377f94b4103a20940e7f1daa9d13c48d9421d4f3a33674c78ac72a2567c649093abc96deb79edfaf8b6190e02c7270c164eb753d7551c59
+  data.tar.gz: 35274bba078799c73274c18037ce4239070169e01d7bbdee80d8534ffc7072569a008a0bd5d38af18d1ec37335297c6ddd13c22ff8664266ccca3c9e159fcbad
data/README.md CHANGED
@@ -4,6 +4,40 @@ The Scrap is a Nokogiri-based framework for scraping data from web pages
 
 The goal is to be simple to use, efficient, highly customizable, and highly adaptable.
 
+## Why
+
+**The basic workflow for scraping web data is:**
+
+1. Pick a start URL, e.g. https://ruby-china.org/topics
+2. Scrape the list. Lists are usually rendered as tr, li, div, dd, and similar nodes, one node per record; for the URL above the CSS selector is ".topics .topic".
+3. Extract each record's fields: title, author, category, detail-page URL, and so on.
+4. Scrape the detail page. A list usually carries only part of the data; the full record has to be extracted from the detail page.
+5. If the source is paginated, loop over the pages.
+6. Post-process the data.
+7. Store or output the data, deduplicate, and so on.
+
+
+**While handling these tasks you typically hit the following problems:**
+
+1. The source HTML cannot be used as-is and needs some preprocessing.
+2. Scraped items need filtering to drop invalid records.
+3. Scraped URLs need fixing up: links and images are often not absolute URLs and must be merged with the current page's address.
+4. Extracted fields need special handling. In the RubyChina example, the content under ".info leader" is "· 618 次阅读" ("viewed 618 times"), but all you want is 618.
+5. Every site has its own pagination mechanism and page-URL rules, which are tedious to handle.
+6. On output, the individually extracted fields usually have to be assembled into an object, a Hash, or similar.
+
+**Long ago I scraped data with Perl; between my Perl skills and the language's limitations it was rather painful. Later I tried many Ruby frameworks and none was satisfying (Scrubyt is probably the best one I used).**
+**So the current approach gradually took shape out of practical needs:**
+
+1. Define scraping rules for the list and the detail page.
+2. The fields to extract, and their extraction rules, are stored in a Hash via method_missing (see the sketch after this hunk).
+3. Rules can extract different attributes and data as needed; a link's href and an IMG's src are automatically passed through URI.join(current_url).
+4. Multiple list nodes can be joined or returned as an Array, e.g. tags.
+5. Several pagination styles are supported.
+6. Detail pages are fetched automatically from the URLs collected in the list pass, and their fields are merged into the same result record.
+7. Each result is a Hash; with suitably named fields it can be fed straight into any OR-mapping library without reassembly.
+8. HTML preprocessing, data filtering, and result handling are plain Ruby lambdas, which raises the level of customization and adaptability.
+
 ## Installation
 
 Add this line to your application's Gemfile:
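Item 2 in the list above is the heart of the DSL: README code such as `scrap.attr_title = 'h2 a'` calls a setter that is never defined. A minimal sketch of how such `attr_*` assignments can be captured into a rules Hash via method_missing — illustrative only, not the gem's actual implementation:

```ruby
class AttrRules
  def initialize
    @rules = {}  # e.g. {"title" => "h2 a", "ori_url" => ["h2 a", "href"]}
  end

  attr_reader :rules

  # Catch any attr_<name>= assignment and record the rule
  # instead of raising NoMethodError.
  def method_missing(name, *args)
    if name.to_s =~ /\Aattr_(\w+)=\z/
      @rules[Regexp.last_match(1)] = args.first
    else
      super
    end
  end

  def respond_to_missing?(name, include_private = false)
    name.to_s.start_with?('attr_') || super
  end
end

rules = AttrRules.new
rules.attr_title   = 'h2 a'             # extract the node's text
rules.attr_ori_url = ['h2 a', 'href']   # extract a node attribute instead
p rules.rules  #=> {"title"=>"h2 a", "ori_url"=>["h2 a", "href"]}
```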
@@ -56,6 +90,7 @@ scrap.verbose = true
 #html preprocess
 scrap.html_proc << lambda { |html|
   #html.gsub(/abcd/,'efgh')
+  html
 }
 
 #filter scraped item
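The one added line in this hunk is a real fix, not a doc tweak: a Ruby lambda returns its last evaluated expression, so with the gsub left commented out the 0.0.1 version of this snippet returned nil rather than the page source. A stand-alone illustration:

```ruby
# returns the rewritten string, as a preprocessor should
pre  = lambda { |html| html.gsub(/abcd/, 'efgh') }
# empty body: returns nil, which would presumably break any later step
noop = lambda { |html| }

p pre.call("abcd!")   #=> "efgh!"
p noop.call("abcd!")  #=> nil
```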
@@ -108,12 +143,12 @@ scrap.has_many_pages = true
 
 
 #:next_page
-scrap.page_method = :next_page
+scrap.pager_method = :next_page
 scrap.next_page_css = ".next_page a"
 
 
 #:total_page
-scrap.page_method = :total_pages
+scrap.pager_method = :total_pages
 scrap.get_page_count = lambda { |doc|
   if doc.css('.total_page').text =~ /(\d+)页/
     $~[1].to_i
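With :next_page, each "next" link is resolved against the URL it was found on (`next_page_url||url` in the gem's loop), so relative, root-relative, and absolute hrefs all work. A small demonstration with made-up URLs:

```ruby
require 'uri'

base = "http://www.example.com/news/review/index.html"
p URI.join(base, "index_2.html").to_s  #=> "http://www.example.com/news/review/index_2.html"
p URI.join(base, "/analysis/").to_s    #=> "http://www.example.com/analysis/"
p URI.join(base, "http://cdn.example.com/p2").to_s  #=> "http://cdn.example.com/p2"
```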
@@ -129,7 +164,7 @@ scrap.get_next_url = lambda { |url,next_page_number|
 }
 
 #**total_record in progress
-scrap.page_method = :total_records
+scrap.pager_method = :total_records
 #...
 
 scrap.scrap_list
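All three README hunks above track the one visible API rename in this release: the pager selector is now `pager_method`. A 0.0.1 script needs a one-line change per pager:

```ruby
scrap.page_method  = :next_page   # 0.0.1
scrap.pager_method = :next_page   # 0.0.2
```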
examples/news.rb ADDED
@@ -0,0 +1,132 @@
+# encoding: utf-8
+require 'rubygems'
+require 'the_scrap'
+require 'pp'
+
+#require 'active_record'
+#require 'mysql2'
+#require 'activerecord-import'
+
+#ActiveRecord::Base.establish_connection( :adapter => "mysql2", :host => "localhost",
+#  :database => "test", :username => "root", :password => "" )
+#
+##custom updated_at,created_at
+#ActiveRecord::Base.record_timestamps = false
+#
+#class Article < ActiveRecord::Base
+#  validates :ori_id,:uniqueness => {:scope => :cat_id}
+#end
+
+#create Object
+scrap = TheScrap::ListObj.new
+
+#set start url
+#Publishing the finished scrape target here would not be appropriate, so the URL has been changed.
+scrap.url = "http://www.xxx.com/news/review/"
+
+#fragment css selector
+scrap.item_frag = ".center ul li.content"
+
+#scrap attr list
+scrap.attr_title = 'h2 a'
+scrap.attr_ori_url = ['h2 a','href']
+scrap.attr_image = ['.detail a img','src']
+scrap.attr_description = '.detail p'
+scrap.attr_infos = '.arcilte_info'
+
+#debug
+scrap.debug = true
+scrap.verbose = true
+
+
+#html preprocess
+scrap.html_proc << lambda { |html|
+  #html.gsub(/abcd/,'efgh')
+  html
+}
+
+#filter scraped item
+scrap.item_filters << lambda { |item_info|
+  return false if item_info['title'].nil? || item_info['title'].length == 0
+  return true
+}
+
+#data process
+scrap.data_proc << lambda {|url,i|
+  i['title'] = i['title'].strip
+
+  if i['infos'] =~ /日期:(.*?)\S+点击/
+    i['created_at'] = i['updated_at'] = Time.parse($~[1].strip) - 8*3600
+  end
+  i.delete('infos')
+
+  if i['ori_url'] =~ /\d+-\d+-(\d+).html/
+    i[:ori_id] = $~[1].to_i
+  end
+
+  i[:cat_id] = @cat_id
+  i[:source] = 'xxx.com'
+
+}
+
+#result process
+scrap.result_proc << lambda {|url,items|
+  #articles = []
+  items.each do |item|
+    #articles << Article.new(item)
+    pp item
+  end
+  #Article.import articles
+}
+
+########### has many pages ###########
+#If set, the scraper can walk through multiple list pages using any of the supported pagination styles.
+
+scrap.has_many_pages = true
+
+#:next_page
+scrap.pager_method = :next_page
+scrap.next_page_css = ".pagenu .next a"
+
+################# has detail page ################
+scrap_detail = TheScrap::DetailObj.new
+scrap_detail.attr_content = [".ar_in_cont_3",:inner_html]
+
+#data process
+scrap_detail.data_proc << lambda {|url,i|
+  content = i['content'].encode('utf-8')
+
+  regex = %q{<a href="http://www\.xxx\.com/" target="_blank"><u>(.*?)</u></a>}
+  content.gsub!(/#{regex}/,'\1')
+
+  content.gsub!(/<div class="context">.*/m,'')
+  content.gsub!(/style=".*?width:.*?\d+px; height:.*?\d+px;.*?"/,'')
+
+  regex = %q{<a href="http://www.xxx.com/.*?" target="_blank">.*?</a>}
+  content.gsub!(/#{regex}/m,'')
+
+  i['content'] = content.strip
+}
+
+
+#get url from list attr and extract data via scrap_detail
+scrap.detail_info << [scrap_detail,'ori_url']
+
+scrap_detail.encoding = 'gbk'
+scrap.encoding = 'gbk'
+
+
+#scrap
+[
+  {url:'http://www.xxx.com/news/review/',cat_id:1},
+  {url:'http://www.xxx.com/news/yejie/',cat_id:2},
+  {url:'http://www.xxx.com/analysis/',cat_id:3},
+].each do |item|
+
+  scrap.url = item[:url]
+  puts "start url:#{scrap.url}"
+  @cat_id = item[:cat_id]
+  scrap.scrap_list
+
+end
+
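One non-obvious Ruby detail in this example: the loop at the bottom assigns `@cat_id`, and the `data_proc` lambda defined earlier reads it. This works because top-level code and top-level lambdas both execute against the same `main` object and therefore share its instance variables. A stand-alone illustration:

```ruby
tag = lambda { |item| item[:cat_id] = @cat_id; item }

@cat_id = 1
p tag.call({})  #=> {:cat_id=>1}

@cat_id = 2
p tag.call({})  #=> {:cat_id=>2}
```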
lib/the_scrap/list_obj.rb CHANGED
@@ -65,17 +65,22 @@ module TheScrap
       scrap(url)
     end
 
-    return unless has_many_pages
+    return unless @has_many_pages
 
     #TODO Refactor it
     next_page_url = nil
-    if pager_method == :next_page #the site has a "next page" link
+    prev_page_url = nil
+    if @pager_method == :next_page #the site has a "next page" link
       while node = doc.css(next_page_css).first
         next_page_url = URI.join(next_page_url||url,node['href']).to_s
-        puts next_page_url if verbose?
+        break if prev_page_url == next_page_url
+
+        puts "url: #{next_page_url}" if verbose?
         doc,items = retryable(:tries => 3, :on => Timeout::Error) do
           scrap(next_page_url)
         end
+
+        prev_page_url = next_page_url
         break if items.count == 0
         break if debug?
       end
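The substance of this hunk is a loop guard: on sites whose last page links to itself, the 0.0.1 pager re-fetched that page forever. A minimal, self-contained sketch of the same guard (the page graph is made up):

```ruby
# each page maps to its "next page" link; the last one links to itself
pages = { "p1" => "p2", "p2" => "p3", "p3" => "p3" }

prev_url = nil
next_url = "p1"
while next_url
  break if next_url == prev_url   # the guard added in 0.0.2
  puts "fetching #{next_url}"
  prev_url = next_url
  next_url = pages[next_url]      # stand-in for doc.css(next_page_css)
end
#=> fetches p1, p2, p3 once each, then stops
```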
lib/the_scrap/version.rb CHANGED
@@ -1,3 +1,3 @@
 module TheScrap
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: the_scrap
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - H.J.LeoChen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-18 00:00:00.000000000 Z
+date: 2014-09-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -64,6 +64,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/news.rb
 - lib/the_scrap.rb
 - lib/the_scrap/detail_obj.rb
 - lib/the_scrap/list_obj.rb