the_scrap 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +38 -3
- data/examples/news.rb +132 -0
- data/lib/the_scrap/list_obj.rb +8 -3
- data/lib/the_scrap/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b8028accaed4377b6cb273ebaf1fcd070efaf726
+  data.tar.gz: 432a41ab802d604176a883009b07865a349f53cb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d6a4cb383cb53c49377f94b4103a20940e7f1daa9d13c48d9421d4f3a33674c78ac72a2567c649093abc96deb79edfaf8b6190e02c7270c164eb753d7551c59
+  data.tar.gz: 35274bba078799c73274c18037ce4239070169e01d7bbdee80d8534ffc7072569a008a0bd5d38af18d1ec37335297c6ddd13c22ff8664266ccca3c9e159fcbad
data/README.md
CHANGED
@@ -4,6 +4,40 @@ The Scrap is a web-scraping framework built on Nokogiri
 
 The goal is to be simple to use, efficient, highly customizable, and highly adaptable.
 
+## Why
+
+**The basic workflow for scraping web data:**
+
+1. Pick the starting URL, e.g. https://ruby-china.org/topics
+2. Scrape the list. Lists are usually rendered as tr, li, div, dd, etc., one node per record; for the URL above, the CSS selector is ".topics .topic".
+3. Extract each record's fields: title, author, category, detail-page URL, and so on.
+4. Scrape the detail page. A list usually carries only partial information, so completing the record means extracting data from the detail page.
+5. If the source is paginated, multiple pages have to be fetched in a loop.
+6. Post-process the data.
+7. Store or output the data, deduplicate, and so on.
+
+
+**While handling those steps you tend to run into these problems:**
+
+1. The source HTML cannot be used directly and needs some preprocessing.
+2. The scraped items need filtering to drop invalid records.
+3. The scraped URLs need processing: links and images are often not absolute URLs and must be merged with the current page's address.
+4. The extracted data needs special handling. Back to the Ruby China example: under ".info leader" the content is "· 618 次阅读" ("read 618 times"), but all you need is 618.
+5. Every site has its own pagination mechanism and page-URL rules, which is quite a hassle to handle.
+6. For output, the individually extracted values usually have to be assembled into an object, a Hash, or the like.
+
+**Long ago I scraped data with Perl; given my Perl skills and some limits of the language, it was rather painful. Later I used many Ruby frameworks and none were satisfying (Scrubyt was the best of the ones I tried).**
+**So, driven by practical needs, I gradually settled on the present approach:**
+
+1. Define scraping rules for the list and for the detail page.
+2. The fields to extract and their extraction rules are captured via method_missing and stored in a Hash (a sketch of the pattern follows this hunk).
+3. Rules can extract different attributes and data as needed; a link's href and an image's src are automatically run through URI.join(current_url).
+4. Multiple list nodes can be joined or returned as an Array, e.g. tags.
+5. Multiple pagination styles are supported.
+6. Detail pages are fetched automatically via the URL captured from the list data, and the extracted details are merged into the same result record.
+7. Each result is a Hash; with appropriately chosen keys it can be stored directly through any OR-mapping library, with no reassembly.
+8. HTML preprocessing, data filtering, and result handling are Ruby lambdas, which improves customizability and adaptability.
+
 ## Installation
 
 Add this line to your application's Gemfile:
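Point 2 of the approach list above — extraction rules captured through method_missing into a Hash — can be pictured with a minimal sketch. This illustrates only the general pattern, not TheScrap's actual internals; `RuleCollector` is a hypothetical name:

```ruby
# Hypothetical sketch: any `attr_xxx = rule` call with no matching writer
# is trapped by method_missing and the rule is stored in a Hash.
class RuleCollector
  def initialize
    @attrs = {}
  end

  attr_reader :attrs

  def method_missing(name, *args)
    if name.to_s =~ /\Aattr_(\w+)=\z/
      @attrs[$1] = args.first   # e.g. attrs['title'] = 'h2 a'
    else
      super
    end
  end

  def respond_to_missing?(name, include_private = false)
    name.to_s.start_with?('attr_') || super
  end
end

rules = RuleCollector.new
rules.attr_title   = 'h2 a'
rules.attr_ori_url = ['h2 a', 'href']
rules.attrs  #=> {"title"=>"h2 a", "ori_url"=>["h2 a", "href"]}
```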
@@ -56,6 +90,7 @@ scrap.verbose = true
 #html preprocess
 scrap.html_proc << lambda { |html|
   #html.gsub(/abcd/,'efgh')
+  html
 }
 
 #filter scraped item
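The added `html` line in this hunk matters because a Ruby lambda returns its last expression: with the sample gsub commented out, the hook would otherwise return nil. Assuming, as the fix suggests, that each html_proc hook's return value becomes the HTML that gets parsed, a working preprocessor would look like this (the pattern below is made up):

```ruby
scrap.html_proc << lambda { |html|
  # gsub returns the transformed string, so it is also the hook's return value
  html.gsub(/<br\s*\/?>/i, "\n")
}
```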
@@ -108,12 +143,12 @@ scrap.has_many_pages = true
 
 
 #:next_page
-scrap.
+scrap.pager_method = :next_page
 scrap.next_page_css = ".next_page a"
 
 
 #:total_page
-scrap.
+scrap.pager_method = :total_pages
 scrap.get_page_count = lambda { |doc|
   if doc.css('.total_page').text =~ /(\d+)页/
     $~[1].to_i
@@ -129,7 +164,7 @@ scrap.get_next_url = lambda { |url,next_page_number|
 }
 
 #**total_record in progress
-scrap.
+scrap.pager_method = :total_records
 #...
 
 scrap.scrap_list
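Taken together, the `:total_pages` branch in these hunks pairs `get_page_count` (read the page count off the document) with `get_next_url` (build the URL for page N). A sketch under assumed conventions — the selector and URL scheme below are hypothetical:

```ruby
scrap.pager_method = :total_pages

# How many pages exist? Read it from the pagination widget.
scrap.get_page_count = lambda { |doc|
  doc.css('.total_page').text =~ /(\d+)页/ ? $~[1].to_i : 1
}

# Derive page N's URL from the first page's URL.
scrap.get_next_url = lambda { |url, next_page_number|
  url.sub(/(index)(_\d+)?\.html\z/) { "#{$1}_#{next_page_number}.html" }
}
```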
data/examples/news.rb
ADDED
@@ -0,0 +1,132 @@
+# encoding: utf-8
+require 'rubygems'
+require 'the_scrap'
+require 'pp'
+
+#require 'active_record'
+#require 'mysql2'
+#require 'activerecord-import'
+
+#ActiveRecord::Base.establish_connection( :adapter => "mysql2", :host => "localhost",
+#  :database => "test", :username => "root", :password => "" )
+#
+##custom update_at,created_at
+#ActiveRecord::Base.record_timestamps = false
+#
+#class Article < ActiveRecord::Base
+#  validates :ori_id,:uniqueness => {:scope => :cat_id}
+#end
+
+#create Object
+scrap = TheScrap::ListObj.new
+
+#set start url
+#Publishing the finished scraper as-is didn't seem appropriate, so the URL has been changed.
+scrap.url = "http://www.xxx.com/news/review/"
+
+#fragment css selector
+scrap.item_frag = ".center ul li.content"
+
+#scrap attr list
+scrap.attr_title = 'h2 a'
+scrap.attr_ori_url = ['h2 a','href']
+scrap.attr_image = ['.detail a img','src']
+scrap.attr_description = '.detail p'
+scrap.attr_infos = '.arcilte_info'
+
+#debug
+scrap.debug = true
+scrap.verbose = true
+
+
+#html preprocess
+scrap.html_proc << lambda { |html|
+  #html.gsub(/abcd/,'efgh')
+  html
+}
+
+#filter scraped item
+scrap.item_filters << lambda { |item_info|
+  return false if item_info['title'].nil? || item_info['title'].length == 0
+  return true
+}
+
+#data process
+scrap.data_proc << lambda {|url,i|
+  i['title'] = i['title'].strip
+
+  if i['infos'] =~ /日期:(.*?)\S+点击/
+    i['created_at'] = i['updated_at'] = Time.parse($~[1].strip) - 8*3600
+  end
+  i.delete('infos')
+
+  if i['ori_url'] =~ /\d+-\d+-(\d+).html/
+    i[:ori_id] = $~[1].to_i
+  end
+
+  i[:cat_id] = @cat_id
+  i[:source] = 'xxx.com'
+
+}
+
+#result process
+scrap.result_proc << lambda {|url,items|
+  #articles = []
+  items.each do |item|
+    #articles << Article.new(item)
+    pp item
+  end
+  #Article.import articles
+}
+
+########### has many pages ###########
+#When enabled, multiple list pages are scraped according to the chosen pagination method.
+
+scrap.has_many_pages = true
+
+#:next_page
+scrap.pager_method = :next_page
+scrap.next_page_css = ".pagenu .next a"
+
+################# has detail page ################
+scrap_detail = TheScrap::DetailObj.new
+scrap_detail.attr_content = [".ar_in_cont_3",:inner_html]
+
+#data process
+scrap_detail.data_proc << lambda {|url,i|
+  content = i['content'].encode('utf-8')
+
+  regex = %q{<a href="http://www\.xxx\.com/" target="_blank"><u>(.*?)</u></a>}
+  content.gsub!(/#{regex}/,'\1')
+
+  content.gsub!(/<div class="context">.*/m,'')
+  content.gsub!(/style=".*?width:.*?\d+px; height:.*?\d+px;.*?"/,'')
+
+  regex = %q{<a href="http://www.xxx.com/.*?" target="_blank">.*?</a>}
+  content.gsub!(/#{regex}/m,'')
+
+  i['content'] = content.strip
+}
+
+
+#get url from list attr and extract data with scrap_detail
+scrap.detail_info << [scrap_detail,'ori_url']
+
+scrap_detail.encoding = 'gbk'
+scrap.encoding = 'gbk'
+
+
+#scrap
+[
+  {url:'http://www.xxx.com/news/review/',cat_id:1},
+  {url:'http://www.xxx.com/news/yejie/',cat_id:2},
+  {url:'http://www.xxx.com/analysis/',cat_id:3},
+].each do |item|
+
+  scrap.url = item[:url]
+  puts "start url:#{scrap.url}"
+  @cat_id = item[:cat_id]
+  scrap.scrap_list
+
+end
+
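One subtlety in news.rb: `@cat_id` is assigned at the top level of the script but read inside the `data_proc` lambda. That works because a lambda captures the scope it was defined in, and at the top level that scope is the `main` object, so both sites refer to the same instance variable. A minimal demonstration of the semantics:

```ruby
@tag = 1
probe = lambda { @tag }
@tag = 2
probe.call  #=> 2 -- the lambda reads main's current @tag, not a snapshot
```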
data/lib/the_scrap/list_obj.rb
CHANGED
@@ -65,17 +65,22 @@ module TheScrap
       scrap(url)
     end
 
-    return unless
+    return unless @has_many_pages
 
     #TODO Refactor it
     next_page_url = nil
-
+    prev_page_url = nil
+    if @pager_method == :next_page #pagination driven by a next-page link
     while node = doc.css(next_page_css).first
       next_page_url = URI.join(next_page_url||url,node['href']).to_s
-
+      break if prev_page_url == next_page_url
+
+      puts "url: #{next_page_url}" if verbose?
       doc,items = retryable(:tries => 3, :on => Timeout::Error) do
         scrap(next_page_url)
       end
+
+      prev_page_url = next_page_url
       break if items.count == 0
       break if debug?
     end
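The `prev_page_url` guard added in this hunk stops the crawl when the resolved next-page URL repeats, which happens on sites where the last page's "next" link points back at itself. The same pattern, distilled into a self-contained sketch with stubbed pagination data:

```ruby
require 'uri'

# Each page maps to its "next" link; the last page links to itself.
NEXT_LINK = {
  'http://example.com/p1' => 'p2',
  'http://example.com/p2' => 'p3',
  'http://example.com/p3' => 'p3',
}

url, prev_url = 'http://example.com/p1', nil
while (href = NEXT_LINK[url])
  url = URI.join(url, href).to_s
  break if url == prev_url  # no forward progress: stop instead of looping forever
  puts "fetch #{url}"       # fetches p2, then p3, then stops
  prev_url = url
end
```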
data/lib/the_scrap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: the_scrap
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.2
 platform: ruby
 authors:
 - H.J.LeoChen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-09-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -64,6 +64,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/news.rb
 - lib/the_scrap.rb
 - lib/the_scrap/detail_obj.rb
 - lib/the_scrap/list_obj.rb