list_spider 2.5.0 → 2.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2d58268cff218eb1a17602e842d6050a18fd87b5d975ad3b4a142320c8229d18
4
- data.tar.gz: 03a8c6918e05656fe381a17402fd05182cceaad4fbadfeeb5753233a250c1aee
3
+ metadata.gz: e51274f347602a7a0ef0cc3b8a7f0b809f08168f74056a18f8c61d9583644c8f
4
+ data.tar.gz: adefb63c47d4084ffb50790716c42e839e0b0652985f3b9bfdaafb0ce0fd98b6
5
5
  SHA512:
6
- metadata.gz: d12f5bb374abb8fcd0f727f2e2f78909c022e8ed43f720f8e913eae46192ba3cfa45dbd45faeb76f76c57e1725abda674e7282ca3ac56a35ed7ac6b76b47e541
7
- data.tar.gz: 324d9f632091ef9301a666e1ed7ffe963cb1647e7ceb33774324cd8c30afa5738305dfb508d1318fee3db15eba0547fc2e7a184d4ed027c7e7bc2340405ea31b
6
+ metadata.gz: cbff095a5255a8b8b9c9fa23d1c17022e31b84a3bcb4b13f2579aebcb2f968d07a543adb9fb2cd19e082539a08845bdae26b675ead2464d5f45ae859d61f7379
7
+ data.tar.gz: 141c006a26e5740cfa697bf2f2a05ded10c57576556f4b67dbb6ed437a57dabdfc0d0b1ffb5b8153bd96717cafecf1181d8dfb4af4cb88ce8a76aad8dc5e2f34
data/README.md CHANGED
@@ -100,11 +100,38 @@ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
100
100
  get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
101
101
  ```
102
102
 
103
+ ## 可以设置为不保存文件,用`ListSpider.save_file = false`,此时不会根据文件路径做去重处理
104
+ ```
105
+ require 'list_spider'
106
+
107
+ def call_back(task_struct, http_req)
108
+ puts "succeed"
109
+ puts http_req.response_header.status
110
+ File.write("index.html", http_req.response)
111
+ end
112
+
113
+ def err_back(task_struct, http_req)
114
+ puts "failed"
115
+ puts http_req.response_header.status
116
+ end
117
+
118
+ ListSpider.save_file = false
119
+
120
+ # get_one is a convenience method for fetching a single TaskStruct
121
+ ListSpider.get_one(
122
+ TaskStruct.new(
123
+ 'https://coolshell.cn/',
124
+ callback: method(:call_back),
125
+ errback: method(:err_back)
126
+ )
127
+ )
128
+ ```
129
+
103
130
  ## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
104
131
 
105
132
  ```ruby
106
133
  new(href, # 请求链接
107
- local_path, # 保存数据的本地路径(此路径作为去重标准)
134
+ local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
108
135
  # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
109
136
  http_method: :get,
110
137
  custom_data: nil, # 自定义数据
@@ -167,6 +194,8 @@ end
167
194
  def call_back(task_struct, http_req)
168
195
  # http_req 是EventMachine::HttpRequest对象
169
196
  # http_req.response_header.status
197
+ # http_req.response_header
198
+ # http_req.response
170
199
  # ...
171
200
  end
172
201
 
@@ -3,6 +3,7 @@ require 'em-http-request'
3
3
  require 'nokogiri'
4
4
  require 'fileutils'
5
5
  require 'set'
6
+ require 'securerandom'
6
7
  require 'addressable/uri'
7
8
  require File.expand_path('spider_helper', __dir__)
8
9
  require File.expand_path('file_filter', __dir__)
@@ -10,12 +11,12 @@ require File.expand_path('file_filter', __dir__)
10
11
  # 爬取任务类
11
12
  class TaskStruct
12
13
  # * href 请求链接
13
- # * local_path 保存数据的本地路径(此路径作为去重标准)
14
+ # * local_path 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
14
15
  # * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
15
16
  # * custom_data 自定义数据
16
17
  # * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
17
18
  def initialize(href, # 请求链接
18
- local_path, # 保存数据的本地路径(此路径作为去重标准)
19
+ local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
19
20
  # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
20
21
  http_method: :get,
21
22
  custom_data: nil, # 自定义数据
@@ -117,8 +118,11 @@ module ListSpider
117
118
  @random_time_range = 3..10
118
119
  @local_path_set = Set.new
119
120
  @down_list = []
121
+ @save_file = true
120
122
 
121
123
  class << self
124
+ attr_accessor :save_file
125
+
122
126
  def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
123
127
  if interval.is_a? Range
124
128
  @random_time_range = interval
@@ -170,7 +174,7 @@ module ListSpider
170
174
  s = http_req.response_header.status
171
175
  puts "#{Time.now}, http status code: #{s}"
172
176
 
173
- if s == 200
177
+ if s == 200 && @save_file
174
178
  local_dir = File.dirname(task_struct.local_path)
175
179
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
176
180
  begin
@@ -197,7 +201,11 @@ module ListSpider
197
201
  end
198
202
 
199
203
  begin
200
- multi.add task_struct.local_path, http_req
204
+ if @save_file
205
+ multi.add task_struct.local_path, http_req
206
+ else
207
+ multi.add SecureRandom.uuid, http_req
208
+ end
201
209
  rescue StandardError => exception
202
210
  puts exception
203
211
  puts task_struct.href
@@ -279,6 +287,8 @@ module ListSpider
279
287
  end
280
288
 
281
289
  def filter_list(down_list)
290
+ return down_list unless @save_file
291
+
282
292
  need_down_list = []
283
293
  down_list.each do |ts|
284
294
  if !ts.overwrite_exist && File.exist?(ts.local_path)
@@ -1,3 +1,3 @@
1
1
  module ListSpider
2
- VERSION = '2.5.0'.freeze
2
+ VERSION = '2.6.0'.freeze
3
3
  end
@@ -0,0 +1,34 @@
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ def call_back(task_struct, http_req)
5
+ puts "succeed"
6
+ puts http_req.response_header.status
7
+ content = http_req.response
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ ListSpider.add_task(TaskStruct.new(href,
15
+ callback: method(:call_back),
16
+ errback: method(:err_back)))
17
+ end
18
+ end
19
+
20
+ def err_back(task_struct, http_req)
21
+ puts "failed"
22
+ puts http_req.response_header.status
23
+ end
24
+
25
+ ListSpider.save_file = false
26
+
27
+ # get_one is a convenience method for fetching a single TaskStruct
28
+ ListSpider.get_one(
29
+ TaskStruct.new(
30
+ 'https://coolshell.cn/',
31
+ callback: method(:call_back),
32
+ errback: method(:err_back)
33
+ )
34
+ )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.0
4
+ version: 2.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-06 00:00:00.000000000 Z
11
+ date: 2020-04-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -117,6 +117,7 @@ files:
117
117
  - list_spider.gemspec
118
118
  - spider_example.rb
119
119
  - spider_example_2.rb
120
+ - spider_not_save_file.rb
120
121
  homepage: https://github.com/chinazhangchao/list_spider
121
122
  licenses:
122
123
  - MIT