spider_bot 0.0.4
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.yardopts +2 -0
- data/Gemfile +10 -0
- data/LICENSE.txt +22 -0
- data/README.md +148 -0
- data/Rakefile +2 -0
- data/bin/spider +12 -0
- data/lib/spider_bot/base.rb +31 -0
- data/lib/spider_bot/cli.rb +183 -0
- data/lib/spider_bot/crawl.rb +235 -0
- data/lib/spider_bot/error.rb +5 -0
- data/lib/spider_bot/http/client.rb +166 -0
- data/lib/spider_bot/http/response.rb +83 -0
- data/lib/spider_bot/load.rb +30 -0
- data/lib/spider_bot/logging.rb +21 -0
- data/lib/spider_bot/railte.rb +6 -0
- data/lib/spider_bot/string/date.yml +29 -0
- data/lib/spider_bot/string/time.rb +119 -0
- data/lib/spider_bot/version.rb +3 -0
- data/lib/spider_bot.rb +37 -0
- data/spider_bot.gemspec +32 -0
- metadata +206 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 6c8009ebd495001ed425bd236e2b894d1b3124dd
  data.tar.gz: 97e5ce361bc6165956fac9f879c526ffb7da922d
SHA512:
  metadata.gz: 951b3f8a49ec3034b6b3031a40270eea9fcb64f1685f1982878523005ae292c0f45dbee000f67bc67ecd16a4a569b39c166d5be5559503ecdde2daa2e4ece00e
  data.tar.gz: 26d664fb1edc26e8edcd8c5aab9f38cd4bc48b3618c098384e000c71aaa7f8f2c357e8dd447c87940d55f64c40ff73db84a01b356e09cbc88ec8c36a8a6f8818
data/.gitignore
ADDED
data/.yardopts
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2015 yee.li

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,148 @@
# SpiderBot

A simple spider bot.

## SpiderBot installation

Add this line to your application's Gemfile:

```
gem 'spider_bot'
```

And then execute:

    $ bundle

Or install it directly with:

    $ gem install spider_bot

## SpiderBot files

#### File format

1. Single-site, single-page crawl; returns the HTML text:

```
SpiderBot.crawl("http://example.com", origin_options)
```

2. Single-site, multi-page crawl:

```
SpiderBot.crawl("#{url}", data: Proc.new{ |data| data }, since: Proc.new{ |data| data }) do

  paginate do
    option :type, :json
    option :path, '#{path}'

    # page counter settings
    option :start, 0
    option :add, 10
    option :expire, 100 # set to -1 to keep paginating indefinitely
    option :sleep, 6

    # settings for extracting data after each page turn
    option :data, Proc.new{ |data| data }
    option :since, Proc.new{ |since| since }

    option :query, { page: "%{page}", since_id: "%{since}" }
  end

  crawl_data do |data|
    # parse the crawled data...
  end
end
```

3. Multi-site, multi-page crawl; crawl tasks can run alongside Rails or Padrino:

```
class Mybot < SpiderBot::Base

  # run automatically by "spider start" or "spider crawl"

  auto do
    origin "#{url}", data: Proc.new{ |data| data }, since: Proc.new{ |since| since }
    execute do

      paginate do
        option :type, :json
        option :path, '#{path}'

        # page counter settings
        option :start, 0
        option :add, 10
        option :expire, 100
        option :sleep, 6

        # settings for extracting data after each page turn
        option :data, Proc.new{ |data| data }
        option :since, Proc.new{ |since| since }

        option :query, { page: "%{page}", since_id: "%{since}" }
      end

      crawl_data do |data|
        # parse the crawled data...
      end
    end
  end
end
```

#### Initial page options (origin_options)

* path
* type
* headers
* query
* data: extracts the data from the initial page
* since: extracts the last item of the initial page's data, used for pagination
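
For illustration, a single-page crawl that passes these options explicitly might look like the following sketch (the URL, CSS selector, and Proc bodies are placeholders, not part of the gem):

```
SpiderBot.crawl("http://example.com/posts",
  type:    :html,
  headers: { "Accept-Language" => "en" },
  query:   { page: 0 },
  data:    Proc.new { |body| body.css(".post") }, # data list on the initial page
  since:   Proc.new { |data| data.last }          # last item, used for pagination
)
```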
#### Pagination options

1. Response settings after a page turn

* paginate_type: the response type after a page turn [:html, :json, :xml]
* paginate_path: the request path after a page turn
* paginate_query: the query parameters after a page turn, e.g. {page: "%{page}", since: "%{since}"}

2. Paging settings

* paginate_start: the starting page number, defaults to 0
* paginate_add: the page increment, defaults to 1
* paginate_expire: the total number of pages, defaults to 30
* paginate_sleep: seconds to sleep between pages, defaults to 0

3. Extracting paginated data

* paginate_data: extracts the data after a page turn; defaults to the origin data if unset
* paginate_since: extracts the last item after a page turn; defaults to origin_since if unset
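
Put together, a paginate block that sets each of these explicitly might look like the sketch below (the path is a placeholder; the numeric values mirror the defaults documented above):

```
paginate do
  option :type,   :html               # paginate_type
  option :path,   '/posts/%{page}'    # paginate_path
  option :query,  { page: "%{page}" } # paginate_query
  option :start,  0                   # paginate_start
  option :add,    1                   # paginate_add
  option :expire, 30                  # paginate_expire
  option :sleep,  0                   # paginate_sleep
end
```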
## SpiderBot commands

* spider url # crawl directly from the command line; returns the HTML text
  - -q query, set the query
  - -d data, extract data
  - -o out, write to a file

* spider crawl # run bot files
  - -b bot, run a single bot file
  - -d dir, run the bot files in the given directory
  - -p expire_page, total number of pages (overrides option :expire)

* spider start # run the crawl service
  - -d daemon, run in the background
  - -t time, set the crawl interval, defaults to 10
  - -r random, set the crawl interval to a random number up to the interval (defaults to a random number up to 10)
  - -e env, set the Spider run environment; when used with Rails or Padrino, selects that run environment
  - -p expire_page, total number of pages (overrides option :expire)

* spider stop # stop the crawl service
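
For example, these invocations follow the flags above (URLs and paths are placeholders):

    $ spider url http://example.com -o out.html
    $ spider crawl -d ./bots -p 5
    $ spider start -d -t 30 -e production
    $ spider stop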
data/Rakefile
ADDED
data/bin/spider
ADDED
data/lib/spider_bot/base.rb
ADDED
@@ -0,0 +1,31 @@
module SpiderBot
  class Base
    class << self
      #
      # executed by the "spider start" and "spider crawl" commands
      #

      def auto &block
        if defined?(BOTCONSOLE)
          klass = Class.new do
            def origin url, options = {}
              @origin_url = url
              @origin_options = options
            end

            def execute name = nil, &block
              crawl_instance = Crawl.new(@origin_url, @origin_options)
              crawl_instance.instance_eval(&block)
            end
          end
          klass.allocate.instance_eval(&block)
        end
      end

      def crawl url, options = {}
        crawl_instance = Crawl.new(url, options)
        crawl_instance.crawl_data
      end
    end
  end
end
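
Note that `auto` only runs when the `spider` executable has defined `BOTCONSOLE` (see cli.rb below), so requiring a bot file from application code is a no-op. A minimal bot file exercising this class might look like the following sketch (`MyBot` and the URL are placeholders):

```
class MyBot < SpiderBot::Base
  auto do
    origin "http://example.com", data: Proc.new { |data| data }
    execute do
      crawl_data do |data, page, type|
        # handle each page of crawled data
      end
    end
  end
end
```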
data/lib/spider_bot/cli.rb
ADDED
@@ -0,0 +1,183 @@
# encoding: utf-8

$LOAD_PATH.unshift(File.expand_path('../..', __FILE__))
require "thor"
require 'spider_bot'
require 'daemons'

BOTCONSOLE = true

module SpiderBot
  class CLI < Thor
    desc "url", "Crawl url"

    method_option :query,
      aliases: "-q",
      desc: "Set url query"

    method_option :data,
      aliases: "-d",
      desc: "Match html data"

    method_option :out,
      aliases: ["-o"],
      desc: "Write to file"

    def url(arg)
      data = Crawl.new(arg, options).crawl_data
      return File.open(options[:out], "w") { |file| file.puts data } if options[:out]
      puts data
    end

    desc "crawl", "Run spider bot file"

    method_option :bot,
      aliases: ["-b"],
      desc: "Read bot file"

    method_option :dir,
      aliases: ["-d"],
      desc: "Read bot directory"

    method_option :expire,
      aliases: ["-p"],
      desc: "Read data expired number"

    def crawl
      $expire_num = options[:expire].to_i if options[:expire]
      require File.join(File.expand_path('../..', __FILE__), "spider_bot/load")

      if options[:bot]
        bot_file = File.expand_path(options[:bot])
        raise "Bot file not found" if !File.exists?(bot_file)
        load bot_file
      end

      if options[:dir]
        bot_dir = File.expand_path(options[:dir])
        raise "Dir not found" if !Dir.exists?(bot_dir)

        threads = []
        Dir.glob("#{bot_dir}/*_bot.rb").each do |file|
          threads << Thread.new do
            begin
              SpiderBot.logger.info "loading bot file #{file}."
              load file
            rescue Exception => e
              SpiderBot.logger.error "error while loading bot file #{file}"
              SpiderBot.logger.error e.to_s
            end
          end
        end
        threads.each { |t| t.join }
      end
    end

    desc "start", "Run spider bot service"

    method_option :daemon,
      aliases: ["-d"],
      desc: "Run spider bot service in background"

    method_option :time,
      aliases: ["-t"],
      desc: "Set crawl interval"

    method_option :random,
      aliases: ["-r"],
      desc: "Set crawl interval to a random value"

    method_option :env,
      aliases: ["-e"],
      desc: "Set spider service environment"

    method_option :expire,
      aliases: ["-p"],
      desc: "Read data expired page number"

    def start
      puts "start....."

      $expire_num = options[:expire].to_i if options[:expire]

      if options[:env]
        ENV['RACK_ENV'] = options[:env]
      else
        ENV['RACK_ENV'] = 'development'
      end

      require File.join(File.expand_path('../..', __FILE__), "spider_bot/load")

      FileUtils.mkdir_p("tmp/pids") if !File.exists?("tmp/pids")

      daemon_options = {
        app_name: 'spider',
        ontop: true,
        dir: 'tmp/pids',
      }

      sleep_time = 10

      if options[:daemon]
        daemon_options[:ontop] = false
      else
        puts "press ctrl-c to exit"
      end

      stop if File.exists?("tmp/pids/spider.pid")

      # accept a bare number of seconds or a d/h/m suffix, e.g. "2h" => 7200
      if option_time = options[:time]
        parse_time = option_time.match(/[dhm]/)
        sleep_time = if parse_time
          case parse_time[0]
          when "d"
            option_time.to_i * 60 * 60 * 24
          when "h"
            option_time.to_i * 60 * 60
          when "m"
            option_time.to_i * 60
          end
        else
          option_time.to_i
        end
      end

      Daemons.daemonize(daemon_options)

      loop do
        threads = []

        BOTDIR.each do |file|
          threads << Thread.new do
            begin
              SpiderBot.logger.info "loading bot file #{file}."
              load file
            rescue Exception => e
              SpiderBot.logger.error "error while loading bot file #{file}"
              SpiderBot.logger.error e.to_s
            end
            sleep(10)
          end
        end

        threads.each { |t| t.join }

        if options[:random]
          random_time = Random.new.rand(sleep_time)
          sleep(random_time.to_i)
        else
          sleep(sleep_time.to_i)
        end
      end
    end

    desc 'stop', "Stop spider bot service"

    def stop
      pid = File.read("tmp/pids/spider.pid").to_i
      Process.kill(9, pid)
      File.delete("tmp/pids/spider.pid")
    end
  end
end
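
The -t flag parsed above accepts either plain seconds or a d/h/m suffix; illustrative invocations:

    $ spider start -t 90     # sleep 90 seconds between rounds
    $ spider start -t 5m     # 5 * 60 = 300 seconds
    $ spider start -t 2h -r  # random interval of up to 2 * 3600 = 7200 seconds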
data/lib/spider_bot/crawl.rb
ADDED
@@ -0,0 +1,235 @@
module SpiderBot
  class Crawl

    # Initialize a new spider bot
    #
    # @param url [String] the spider target website url
    # @param options [Hash] the spider crawl configuration options
    # @option options :type [Symbol] the request body format, `:html` or `:json`
    # @option options :headers [Hash] the custom request headers
    # @option options :path [String] the custom request path
    # @option options :query [Hash] the request query
    # @option options :user_agent [String] the custom request user agent
    # @option options :source [Boolean] return the raw response body instead of parsing it
    # @option options :data [Proc] get the crawl data list from the body
    # @option options :first [Proc] get the first item of the crawl data list
    # @option options :last [Proc] get the last item of the crawl data list
    # @option options :encode [String] custom request encoding
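    #
    # @example a minimal sketch (URL and selector are placeholders)
    #   crawl = SpiderBot::Crawl.new("http://example.com/posts",
    #     type: :html,
    #     data: Proc.new { |body| body.css(".post") })
    #   crawl.crawl_data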

    def initialize(url, options = {})
      parse_uri = URI.parse url
      @uri = parse_uri.scheme + "://" + parse_uri.host

      # don't append the default ports (80, 443) to the url
      if !["80", "443"].include?(parse_uri.port.to_s)
        @uri = @uri + ":" + parse_uri.port.to_s
      end

      @origin_path = parse_uri.path || "/"

      @origin_type = options[:type] || :html
      @origin_headers = options[:headers] || {}
      @origin_query = options[:query] || {}

      @origin_user_agent = options[:user_agent] || "Mac Safari"
      @origin_source = options[:source] || false

      @origin_data = options[:data]
      @origin_first = options[:first]
      @origin_last = options[:last]

      @origin_encode = options[:encode]

      @page_path = @origin_path
      @page_type = @origin_type
      @page_headers = @origin_headers || {}
      @page_query = {}

      @page_data = @origin_data
      @page_first = @origin_first
      @page_last = @origin_last

      @page_start = 1
      @page_add = 1
      @page_expire = 10
      @page_sleep = 0

      @paginate_last = nil
      @paginate_error = 0
      @paginate_type = :html
      @paginate_path = ""
      @paginate_query = {}

      @connection = Http::Client.new do |http|
        http.url = @uri
        http.user_agent = @origin_user_agent
        http.headers = @origin_headers
      end
    end

    # Process the crawled data
    #
    # @param block [Proc] yielded each page's data, page number, and type

    def crawl_data(&block)
      @paginate_num = @page_start

      catch :all do
        begin
          crawl_response = crawl_request(@origin_path, @origin_query, @origin_type, @origin_data, @origin_first, @origin_last, &block)
          return crawl_response if !block_given?
          process_response(crawl_response, &block)
        rescue Exception => e
          handle_error(e)
          crawl_data(&block)
        end

        @paginate_error = 0
        return if @page_query.blank? && @page_path == @origin_path

        crawl_paginate(&block)
      end
    end

    private

    def crawl_paginate(&block)
      @page_headers.merge!({ "X-Requested-With" => "XMLHttpRequest" }) if @page_type.to_s == 'json'
      @connection.headers = @page_headers
      begin
        loop do
          real_page_num = (@page_start == 0 && @page_add > 1) ? (@paginate_num / @page_add) + 1 : @paginate_num
          if defined?($expire_num)
            if $expire_num > 1
              break if real_page_num > $expire_num.to_i
            else
              break if real_page_num > 1
            end
          end

          # stop paginating once the current page number exceeds @page_expire (-1 means no limit)
          if real_page_num > @page_expire && @page_expire != -1
            SpiderBot.logger.info "Crawl finished..."
            SpiderBot.logger.info "Finish reason: the current page exceeds the configured paginate expire"
            break
          end

          sleep(@page_sleep) if @page_sleep > 0

          path = @page_path.to_s % { page: @paginate_num }
          query_str = @page_query.to_s % { page: @paginate_num, last: @paginate_last, first: @paginate_first }
          query = eval(query_str)

          crawl_response = crawl_request(path, query, @page_type, @page_data, @page_first, @page_last, &block)
          process_response(crawl_response, &block)
        end
      rescue Exception => e
        @paginate_num += @page_add if @paginate_error == 2
        handle_error(e)
        crawl_paginate(&block)
      end
    end

    def crawl_request(path, query, type, data, first, last, &block)
      @paginate_path = path
      @paginate_query = query

      response = @connection.get(path, query)

      return if !response
      return if response.status != 200

      options = { encode: @origin_encode } if @origin_encode

      if @origin_source && !block_given?
        return response.body(options)
      end

      if type.to_s == "html"
        @paginate_type = :html
        body = Nokogiri::HTML response.body(options)
      elsif type.to_s == "json"
        @paginate_type = :json
        body = MultiJson.load response.body(options)
      else
        @paginate_type = response.parser
        body = response.parsed
      end

      return if body.nil?
      return body if data.nil?

      body_data = data.call(body) if data
      @paginate_first = first.call(body_data, body) if first
      @paginate_last = last.call(body_data, body) if last

      return body_data
    end

    def get_page_url
      if !@paginate_query.blank?
        @uri + @paginate_path + "?" + @paginate_query.map{ |k, v| "#{k}=#{v}" }.join("&")
      else
        @uri + @paginate_path
      end
    end

    def set_paginate_headers(arg)
      @page_headers = arg || {}
    end

    # Set crawl paginate settings
    #
    # @example
    #   paginate do
    #     option :path, '/path'
    #     option :query, {page: "%{page}"}
    #     option :first, Proc.new{ |data| data.css("#item") }
    #     option :last, Proc.new{ |data| data.css("#item") }
    #     option :type, :html
    #     option :data, Proc.new{ |body| body.css("#item") }
    #     option :start, 1
    #     option :add, 1
    #     option :expire, 100
    #     option :sleep, 100
    #   end

    def paginate(&block)
      block.call
    end

    def option(name, params)
      raise "invalid paginate option: #{name}" unless %i(path query first last type data start add expire sleep).include?(name.to_sym)
      instance_variable_set("@page_#{name}", params)
    end

    def break_all
      throw :all
    end

    def handle_error(error)
      SpiderBot.logger.error "crawling url #{get_page_url} raised an error..."
      SpiderBot.logger.error error.to_s

      break_all if @paginate_error == 3
      @paginate_error += 1

      sleep(60 * @paginate_error)
    end

    # Finish the crawl when the http client response is blank
    #
    # @param response [Object] the crawled response data

    def process_response(response, &block)
      if response.blank?
        SpiderBot.logger.info "Crawl finished..."
        SpiderBot.logger.info "Finish reason: crawl response body is blank..."
        break_all
      end
      SpiderBot.logger.info "crawling page for #{get_page_url}"
      yield response, @paginate_num, @paginate_type
      @paginate_num += @page_add
      @paginate_error = 0
    end
  end
end
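
A note on the pagination templating in crawl_paginate above: the query hash is serialized with to_s, its %{page}/%{last}/%{first} references are filled in via String#% named substitution, and the result is eval'd back into a hash. A standalone sketch of that round trip (the exact inspect format varies slightly across Ruby versions):

```
query_template = { page: "%{page}", since_id: "%{last}" }
query_str = query_template.to_s % { page: 2, last: 1024, first: nil }
# => "{:page=>\"2\", :since_id=>\"1024\"}"
query = eval(query_str)
# => { page: "2", since_id: "1024" }
```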