list_spider 2.5.0 → 2.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +30 -1
- data/lib/list_spider.rb +14 -4
- data/lib/list_spider/version.rb +1 -1
- data/spider_not_save_file.rb +34 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e51274f347602a7a0ef0cc3b8a7f0b809f08168f74056a18f8c61d9583644c8f
|
4
|
+
data.tar.gz: adefb63c47d4084ffb50790716c42e839e0b0652985f3b9bfdaafb0ce0fd98b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cbff095a5255a8b8b9c9fa23d1c17022e31b84a3bcb4b13f2579aebcb2f968d07a543adb9fb2cd19e082539a08845bdae26b675ead2464d5f45ae859d61f7379
|
7
|
+
data.tar.gz: 141c006a26e5740cfa697bf2f2a05ded10c57576556f4b67dbb6ed437a57dabdfc0d0b1ffb5b8153bd96717cafecf1181d8dfb4af4cb88ce8a76aad8dc5e2f34
|
data/README.md
CHANGED
@@ -100,11 +100,38 @@ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
|
100
100
|
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
101
101
|
```
|
102
102
|
|
103
|
+
## 可以设置为不保存文件,用`ListSpider.save_file = false`,此时不会根据文件路径做去重处理
|
104
|
+
```
|
105
|
+
require 'list_spider'
|
106
|
+
|
107
|
+
def call_back(task_struct, http_req)
|
108
|
+
puts "succeed"
|
109
|
+
puts http_req.response_header.status
|
110
|
+
File.write("index.html", http_req.response)
|
111
|
+
end
|
112
|
+
|
113
|
+
def err_back(task_struct, http_req)
|
114
|
+
puts "failed"
|
115
|
+
puts http_req.response_header.status
|
116
|
+
end
|
117
|
+
|
118
|
+
ListSpider.save_file = false
|
119
|
+
|
120
|
+
# get_one is a simple function for one taskstruct situation
|
121
|
+
ListSpider.get_one(
|
122
|
+
TaskStruct.new(
|
123
|
+
'https://coolshell.cn/',
|
124
|
+
callback: method(:call_back),
|
125
|
+
errback: method(:err_back)
|
126
|
+
)
|
127
|
+
)
|
128
|
+
```
|
129
|
+
|
103
130
|
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
104
131
|
|
105
132
|
```ruby
|
106
133
|
new(href, # 请求链接
|
107
|
-
local_path, #
|
134
|
+
local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
|
108
135
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
136
|
http_method: :get,
|
110
137
|
custom_data: nil, # 自定义数据
|
@@ -167,6 +194,8 @@ end
|
|
167
194
|
def call_back(task_struct, http_req)
|
168
195
|
# http_req 是EventMachine::HttpRequest对象
|
169
196
|
# http_req.response_header.status
|
197
|
+
# http_req.response_header
|
198
|
+
# http_req.response
|
170
199
|
# ...
|
171
200
|
end
|
172
201
|
|
data/lib/list_spider.rb
CHANGED
@@ -3,6 +3,7 @@ require 'em-http-request'
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'fileutils'
|
5
5
|
require 'set'
|
6
|
+
require 'securerandom'
|
6
7
|
require 'addressable/uri'
|
7
8
|
require File.expand_path('spider_helper', __dir__)
|
8
9
|
require File.expand_path('file_filter', __dir__)
|
@@ -10,12 +11,12 @@ require File.expand_path('file_filter', __dir__)
|
|
10
11
|
# 爬取任务类
|
11
12
|
class TaskStruct
|
12
13
|
# * href 请求链接
|
13
|
-
# * local_path
|
14
|
+
# * local_path 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
|
14
15
|
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
16
|
# * custom_data 自定义数据
|
16
17
|
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
17
18
|
def initialize(href, # 请求链接
|
18
|
-
local_path, #
|
19
|
+
local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
|
19
20
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
20
21
|
http_method: :get,
|
21
22
|
custom_data: nil, # 自定义数据
|
@@ -117,8 +118,11 @@ module ListSpider
|
|
117
118
|
@random_time_range = 3..10
|
118
119
|
@local_path_set = Set.new
|
119
120
|
@down_list = []
|
121
|
+
@save_file = true
|
120
122
|
|
121
123
|
class << self
|
124
|
+
attr_accessor :save_file
|
125
|
+
|
122
126
|
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
123
127
|
if interval.is_a? Range
|
124
128
|
@random_time_range = interval
|
@@ -170,7 +174,7 @@ module ListSpider
|
|
170
174
|
s = http_req.response_header.status
|
171
175
|
puts "#{Time.now}, http status code: #{s}"
|
172
176
|
|
173
|
-
if s == 200
|
177
|
+
if s == 200 && @save_file
|
174
178
|
local_dir = File.dirname(task_struct.local_path)
|
175
179
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
176
180
|
begin
|
@@ -197,7 +201,11 @@ module ListSpider
|
|
197
201
|
end
|
198
202
|
|
199
203
|
begin
|
200
|
-
|
204
|
+
if @save_file
|
205
|
+
multi.add task_struct.local_path, http_req
|
206
|
+
else
|
207
|
+
multi.add SecureRandom.uuid, http_req
|
208
|
+
end
|
201
209
|
rescue StandardError => exception
|
202
210
|
puts exception
|
203
211
|
puts task_struct.href
|
@@ -279,6 +287,8 @@ module ListSpider
|
|
279
287
|
end
|
280
288
|
|
281
289
|
def filter_list(down_list)
|
290
|
+
return down_list unless @save_file
|
291
|
+
|
282
292
|
need_down_list = []
|
283
293
|
down_list.each do |ts|
|
284
294
|
if !ts.overwrite_exist && File.exist?(ts.local_path)
|
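data/lib/list_spider/version.rb
CHANGED
(the +1 -1 hunk for this file is missing from the extracted text)
data/spider_not_save_file.rb
ADDED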
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
|
+
|
4
|
+
def call_back(task_struct, http_req)
|
5
|
+
puts "succeed"
|
6
|
+
puts http_req.response_header.status
|
7
|
+
content = http_req.response
|
8
|
+
doc = Nokogiri::HTML(content)
|
9
|
+
list_group = doc.css('h2.entry-title')
|
10
|
+
link_list = list_group.css('a')
|
11
|
+
|
12
|
+
link_list.each do |link|
|
13
|
+
href = link['href']
|
14
|
+
ListSpider.add_task(TaskStruct.new(href,
|
15
|
+
callback: method(:call_back),
|
16
|
+
errback: method(:err_back)))
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def err_back(task_struct, http_req)
|
21
|
+
puts "failed"
|
22
|
+
puts http_req.response_header.status
|
23
|
+
end
|
24
|
+
|
25
|
+
ListSpider.save_file = false
|
26
|
+
|
27
|
+
# get_one is a simple function for one taskstruct situation
|
28
|
+
ListSpider.get_one(
|
29
|
+
TaskStruct.new(
|
30
|
+
'https://coolshell.cn/',
|
31
|
+
callback: method(:call_back),
|
32
|
+
errback: method(:err_back)
|
33
|
+
)
|
34
|
+
)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.0
|
4
|
+
version: 2.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-04-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -117,6 +117,7 @@ files:
|
|
117
117
|
- list_spider.gemspec
|
118
118
|
- spider_example.rb
|
119
119
|
- spider_example_2.rb
|
120
|
+
- spider_not_save_file.rb
|
120
121
|
homepage: https://github.com/chinazhangchao/list_spider
|
121
122
|
licenses:
|
122
123
|
- MIT
|