list_spider 2.5.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +30 -1
- data/lib/list_spider.rb +14 -4
- data/lib/list_spider/version.rb +1 -1
- data/spider_not_save_file.rb +34 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e51274f347602a7a0ef0cc3b8a7f0b809f08168f74056a18f8c61d9583644c8f
|
4
|
+
data.tar.gz: adefb63c47d4084ffb50790716c42e839e0b0652985f3b9bfdaafb0ce0fd98b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cbff095a5255a8b8b9c9fa23d1c17022e31b84a3bcb4b13f2579aebcb2f968d07a543adb9fb2cd19e082539a08845bdae26b675ead2464d5f45ae859d61f7379
|
7
|
+
data.tar.gz: 141c006a26e5740cfa697bf2f2a05ded10c57576556f4b67dbb6ed437a57dabdfc0d0b1ffb5b8153bd96717cafecf1181d8dfb4af4cb88ce8a76aad8dc5e2f34
|
data/README.md
CHANGED
@@ -100,11 +100,38 @@ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
|
100
100
|
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
101
101
|
```
|
102
102
|
|
103
|
+
## 可以设置为不保存文件,用`ListSpider.save_file = false`,此时不会根据文件路径做去重处理
|
104
|
+
```
|
105
|
+
require 'list_spider'
|
106
|
+
|
107
|
+
def call_back(task_struct, http_req)
|
108
|
+
puts "succeed"
|
109
|
+
puts http_req.response_header.status
|
110
|
+
File.write("index.html", http_req.response)
|
111
|
+
end
|
112
|
+
|
113
|
+
def err_back(task_struct, http_req)
|
114
|
+
puts "failed"
|
115
|
+
puts http_req.response_header.status
|
116
|
+
end
|
117
|
+
|
118
|
+
ListSpider.save_file = false
|
119
|
+
|
120
|
+
# get_one is a simple function for one taskstruct situation
|
121
|
+
ListSpider.get_one(
|
122
|
+
TaskStruct.new(
|
123
|
+
'https://coolshell.cn/',
|
124
|
+
callback: method(:call_back),
|
125
|
+
errback: method(:err_back)
|
126
|
+
)
|
127
|
+
)
|
128
|
+
```
|
129
|
+
|
103
130
|
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
104
131
|
|
105
132
|
```ruby
|
106
133
|
new(href, # 请求链接
|
107
|
-
local_path, #
|
134
|
+
local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
|
108
135
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
136
|
http_method: :get,
|
110
137
|
custom_data: nil, # 自定义数据
|
@@ -167,6 +194,8 @@ end
|
|
167
194
|
def call_back(task_struct, http_req)
|
168
195
|
# http_req 是EventMachine::HttpRequest对象
|
169
196
|
# http_req.response_header.status
|
197
|
+
# http_req.response_header
|
198
|
+
# http_req.response
|
170
199
|
# ...
|
171
200
|
end
|
172
201
|
|
data/lib/list_spider.rb
CHANGED
@@ -3,6 +3,7 @@ require 'em-http-request'
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'fileutils'
|
5
5
|
require 'set'
|
6
|
+
require 'securerandom'
|
6
7
|
require 'addressable/uri'
|
7
8
|
require File.expand_path('spider_helper', __dir__)
|
8
9
|
require File.expand_path('file_filter', __dir__)
|
@@ -10,12 +11,12 @@ require File.expand_path('file_filter', __dir__)
|
|
10
11
|
# 爬取任务类
|
11
12
|
class TaskStruct
|
12
13
|
# * href 请求链接
|
13
|
-
# * local_path
|
14
|
+
# * local_path 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
|
14
15
|
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
16
|
# * custom_data 自定义数据
|
16
17
|
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
17
18
|
def initialize(href, # 请求链接
|
18
|
-
local_path, #
|
19
|
+
local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
|
19
20
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
20
21
|
http_method: :get,
|
21
22
|
custom_data: nil, # 自定义数据
|
@@ -117,8 +118,11 @@ module ListSpider
|
|
117
118
|
@random_time_range = 3..10
|
118
119
|
@local_path_set = Set.new
|
119
120
|
@down_list = []
|
121
|
+
@save_file = true
|
120
122
|
|
121
123
|
class << self
|
124
|
+
attr_accessor :save_file
|
125
|
+
|
122
126
|
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
123
127
|
if interval.is_a? Range
|
124
128
|
@random_time_range = interval
|
@@ -170,7 +174,7 @@ module ListSpider
|
|
170
174
|
s = http_req.response_header.status
|
171
175
|
puts "#{Time.now}, http status code: #{s}"
|
172
176
|
|
173
|
-
if s == 200
|
177
|
+
if s == 200 && @save_file
|
174
178
|
local_dir = File.dirname(task_struct.local_path)
|
175
179
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
176
180
|
begin
|
@@ -197,7 +201,11 @@ module ListSpider
|
|
197
201
|
end
|
198
202
|
|
199
203
|
begin
|
200
|
-
|
204
|
+
if @save_file
|
205
|
+
multi.add task_struct.local_path, http_req
|
206
|
+
else
|
207
|
+
multi.add SecureRandom.uuid, http_req
|
208
|
+
end
|
201
209
|
rescue StandardError => exception
|
202
210
|
puts exception
|
203
211
|
puts task_struct.href
|
@@ -279,6 +287,8 @@ module ListSpider
|
|
279
287
|
end
|
280
288
|
|
281
289
|
def filter_list(down_list)
|
290
|
+
return down_list unless @save_file
|
291
|
+
|
282
292
|
need_down_list = []
|
283
293
|
down_list.each do |ts|
|
284
294
|
if !ts.overwrite_exist && File.exist?(ts.local_path)
|
data/lib/list_spider/version.rb
CHANGED
data/spider_not_save_file.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
|
+
|
4
|
+
def call_back(task_struct, http_req)
|
5
|
+
puts "succeed"
|
6
|
+
puts http_req.response_header.status
|
7
|
+
content = http_req.response
|
8
|
+
doc = Nokogiri::HTML(content)
|
9
|
+
list_group = doc.css('h2.entry-title')
|
10
|
+
link_list = list_group.css('a')
|
11
|
+
|
12
|
+
link_list.each do |link|
|
13
|
+
href = link['href']
|
14
|
+
ListSpider.add_task(TaskStruct.new(href,
|
15
|
+
callback: method(:call_back),
|
16
|
+
errback: method(:err_back)))
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def err_back(task_struct, http_req)
|
21
|
+
puts "failed"
|
22
|
+
puts http_req.response_header.status
|
23
|
+
end
|
24
|
+
|
25
|
+
ListSpider.save_file = false
|
26
|
+
|
27
|
+
# get_one is a simple function for one taskstruct situation
|
28
|
+
ListSpider.get_one(
|
29
|
+
TaskStruct.new(
|
30
|
+
'https://coolshell.cn/',
|
31
|
+
callback: method(:call_back),
|
32
|
+
errback: method(:err_back)
|
33
|
+
)
|
34
|
+
)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.0
|
4
|
+
version: 2.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-04-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -117,6 +117,7 @@ files:
|
|
117
117
|
- list_spider.gemspec
|
118
118
|
- spider_example.rb
|
119
119
|
- spider_example_2.rb
|
120
|
+
- spider_not_save_file.rb
|
120
121
|
homepage: https://github.com/chinazhangchao/list_spider
|
121
122
|
licenses:
|
122
123
|
- MIT
|