list_spider 2.5.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2d58268cff218eb1a17602e842d6050a18fd87b5d975ad3b4a142320c8229d18
4
- data.tar.gz: 03a8c6918e05656fe381a17402fd05182cceaad4fbadfeeb5753233a250c1aee
3
+ metadata.gz: e51274f347602a7a0ef0cc3b8a7f0b809f08168f74056a18f8c61d9583644c8f
4
+ data.tar.gz: adefb63c47d4084ffb50790716c42e839e0b0652985f3b9bfdaafb0ce0fd98b6
5
5
  SHA512:
6
- metadata.gz: d12f5bb374abb8fcd0f727f2e2f78909c022e8ed43f720f8e913eae46192ba3cfa45dbd45faeb76f76c57e1725abda674e7282ca3ac56a35ed7ac6b76b47e541
7
- data.tar.gz: 324d9f632091ef9301a666e1ed7ffe963cb1647e7ceb33774324cd8c30afa5738305dfb508d1318fee3db15eba0547fc2e7a184d4ed027c7e7bc2340405ea31b
6
+ metadata.gz: cbff095a5255a8b8b9c9fa23d1c17022e31b84a3bcb4b13f2579aebcb2f968d07a543adb9fb2cd19e082539a08845bdae26b675ead2464d5f45ae859d61f7379
7
+ data.tar.gz: 141c006a26e5740cfa697bf2f2a05ded10c57576556f4b67dbb6ed437a57dabdfc0d0b1ffb5b8153bd96717cafecf1181d8dfb4af4cb88ce8a76aad8dc5e2f34
data/README.md CHANGED
@@ -100,11 +100,38 @@ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
100
100
  get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
101
101
  ```
102
102
 
103
+ ## 可以设置为不保存文件,用`ListSpider.save_file = false`,此时不会根据文件路径做去重处理
104
+ ```
105
+ require 'list_spider'
106
+
107
+ def call_back(task_struct, http_req)
108
+ puts "succeed"
109
+ puts http_req.response_header.status
110
+ File.write("index.html", http_req.response)
111
+ end
112
+
113
+ def err_back(task_struct, http_req)
114
+ puts "failed"
115
+ puts http_req.response_header.status
116
+ end
117
+
118
+ ListSpider.save_file = false
119
+
120
+ # get_one is a simple function for one taskstruct situation
121
+ ListSpider.get_one(
122
+ TaskStruct.new(
123
+ 'https://coolshell.cn/',
124
+ callback: method(:call_back),
125
+ errback: method(:err_back)
126
+ )
127
+ )
128
+ ```
129
+
103
130
  ## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
104
131
 
105
132
  ```ruby
106
133
  new(href, # 请求链接
107
- local_path, # 保存数据的本地路径(此路径作为去重标准)
134
+ local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
108
135
  # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
109
136
  http_method: :get,
110
137
  custom_data: nil, # 自定义数据
@@ -167,6 +194,8 @@ end
167
194
  def call_back(task_struct, http_req)
168
195
  # http_req 是EventMachine::HttpRequest对象
169
196
  # http_req.response_header.status
197
+ # http_req.response_header
198
+ # http_req.response
170
199
  # ...
171
200
  end
172
201
 
@@ -3,6 +3,7 @@ require 'em-http-request'
3
3
  require 'nokogiri'
4
4
  require 'fileutils'
5
5
  require 'set'
6
+ require 'securerandom'
6
7
  require 'addressable/uri'
7
8
  require File.expand_path('spider_helper', __dir__)
8
9
  require File.expand_path('file_filter', __dir__)
@@ -10,12 +11,12 @@ require File.expand_path('file_filter', __dir__)
10
11
  # 爬取任务类
11
12
  class TaskStruct
12
13
  # * href 请求链接
13
- # * local_path 保存数据的本地路径(此路径作为去重标准)
14
+ # * local_path 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
14
15
  # * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
15
16
  # * custom_data 自定义数据
16
17
  # * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
17
18
  def initialize(href, # 请求链接
18
- local_path, # 保存数据的本地路径(此路径作为去重标准)
19
+ local_path = :nil, # 保存数据的本地路径(保存文件的情况下此路径作为去重标准)
19
20
  # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
20
21
  http_method: :get,
21
22
  custom_data: nil, # 自定义数据
@@ -117,8 +118,11 @@ module ListSpider
117
118
  @random_time_range = 3..10
118
119
  @local_path_set = Set.new
119
120
  @down_list = []
121
+ @save_file = true
120
122
 
121
123
  class << self
124
+ attr_accessor :save_file
125
+
122
126
  def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
123
127
  if interval.is_a? Range
124
128
  @random_time_range = interval
@@ -170,7 +174,7 @@ module ListSpider
170
174
  s = http_req.response_header.status
171
175
  puts "#{Time.now}, http status code: #{s}"
172
176
 
173
- if s == 200
177
+ if s == 200 && @save_file
174
178
  local_dir = File.dirname(task_struct.local_path)
175
179
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
176
180
  begin
@@ -197,7 +201,11 @@ module ListSpider
197
201
  end
198
202
 
199
203
  begin
200
- multi.add task_struct.local_path, http_req
204
+ if @save_file
205
+ multi.add task_struct.local_path, http_req
206
+ else
207
+ multi.add SecureRandom.uuid, http_req
208
+ end
201
209
  rescue StandardError => exception
202
210
  puts exception
203
211
  puts task_struct.href
@@ -279,6 +287,8 @@ module ListSpider
279
287
  end
280
288
 
281
289
  def filter_list(down_list)
290
+ return down_list unless @save_file
291
+
282
292
  need_down_list = []
283
293
  down_list.each do |ts|
284
294
  if !ts.overwrite_exist && File.exist?(ts.local_path)
@@ -1,3 +1,3 @@
1
1
  module ListSpider
2
- VERSION = '2.5.0'.freeze
2
+ VERSION = '2.6.0'.freeze
3
3
  end
@@ -0,0 +1,34 @@
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ def call_back(task_struct, http_req)
5
+ puts "succeed"
6
+ puts http_req.response_header.status
7
+ content = http_req.response
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ ListSpider.add_task(TaskStruct.new(href,
15
+ callback: method(:call_back),
16
+ errback: method(:err_back)))
17
+ end
18
+ end
19
+
20
+ def err_back(task_struct, http_req)
21
+ puts "failed"
22
+ puts http_req.response_header.status
23
+ end
24
+
25
+ ListSpider.save_file = false
26
+
27
+ # get_one is a simple function for one taskstruct situation
28
+ ListSpider.get_one(
29
+ TaskStruct.new(
30
+ 'https://coolshell.cn/',
31
+ callback: method(:call_back),
32
+ errback: method(:err_back)
33
+ )
34
+ )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.0
4
+ version: 2.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-06 00:00:00.000000000 Z
11
+ date: 2020-04-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -117,6 +117,7 @@ files:
117
117
  - list_spider.gemspec
118
118
  - spider_example.rb
119
119
  - spider_example_2.rb
120
+ - spider_not_save_file.rb
120
121
  homepage: https://github.com/chinazhangchao/list_spider
121
122
  licenses:
122
123
  - MIT