list_spider 1.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
- data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
+ metadata.gz: 39600b837bb18841d083c7b50dbaadf82e72c3013f690129af6786efec193a39
+ data.tar.gz: 4128e673c551e3fcc2c1f9d4a3302407bcf7bc26829a4957d04ebc0505d5ce07
  SHA512:
- metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
- data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
+ metadata.gz: f900e8f76086f37239872d9b4452f5d735799100879ac16570d29c9570837adca52c3c9e37c725913920a68add7784bc2f94e2cef42663c54930ae5b3e37ec50
+ data.tar.gz: 90495a4dae2552c3f41e55f0efa61fef0511581eb2e13d90256e0a585c48f7fdb2af167cd8c6daa98ca80c2229d970187fbcec3db4a8edd43738f76f79c18951
data/.rdoc_options ADDED
@@ -0,0 +1,23 @@
+ --- !ruby/object:RDoc::Options
+ encoding: UTF-8
+ static_path: []
+ rdoc_include:
+ - "."
+ - "/Users/zhangchao/github/list_spider"
+ charset: UTF-8
+ exclude:
+ hyperlink_all: false
+ line_numbers: false
+ locale:
+ locale_dir: locale
+ locale_name:
+ main_page:
+ markup: markdown
+ output_decoration: true
+ page_dir:
+ show_hash: false
+ tab_width: 8
+ template_stylesheets: []
+ title:
+ visibility: :protected
+ webcvs:
data/.rubocop.yml CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
  Enabled: false
  Lint/AmbiguousRegexpLiteral:
  Enabled: false
- Lint/DefEndAlignment:
+ Layout/DefEndAlignment:
  AutoCorrect: true
- Lint/EndAlignment:
+ Layout/EndAlignment:
  AutoCorrect: true
  Style/BracesAroundHashParameters:
  Enabled: false
data/English_README.md ADDED
@@ -0,0 +1,169 @@
+ # list_spider
+
+ A URL-list spider based on em-http-request.
+
+ Often we only need to crawl a list of URLs, parse the results, and crawl the links we extract. list_spider is built for exactly that purpose.
+
+ ## Features
+ * Duplicate URL filtering (based on the local save path, so you can customize the behavior).
+
+ * Conversion to UTF-8 supported.
+
+ * Incremental crawling (files that already exist are not fetched again).
+
+ * Customizable concurrency limit and interval between tasks.
+
+ * Full HTTP options support.
+
+ ## Getting started
+
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```
+
+ ## Use like this
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ @next_list = []
+
+ def parse_index_item(e)
+ content = File.read(e.local_path)
+ doc = Nokogiri::HTML(content)
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')
+
+ link_list.each do |link|
+ href = link['href']
+ local_path = DOWNLOAD_DIR + link.content + '.html'
+ # or you can save them to a database for later use
+ @next_list << TaskStruct.new(href, local_path)
+ end
+ end
+
+ task_list = []
+ task_list << TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ )
+
+ ListSpider.get_list(task_list)
+ ListSpider.get_list(@next_list, max: 60)
+ ```
+
+ ## Or in one step
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ def parse_index_item(e)
+ content = File.read(e.local_path)
+ doc = Nokogiri::HTML(content)
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')
+
+ link_list.each do |link|
+ href = link['href']
+ local_path = DOWNLOAD_DIR + link.content + '.html'
+ ListSpider.add_task(TaskStruct.new(href, local_path))
+ end
+ end
+
+ # get_one is a convenience wrapper for the single-TaskStruct case
+ ListSpider.get_one(
+ TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ ),
+ max: 60
+ )
+ ```
+
+ ## And there are many options you can use
+
+ ```ruby
+ def initialize(href, # request URL
+ local_path, # local path to save the data (this path is used as the deduplication key)
+ # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ http_method: :get,
+ custom_data: nil, # user-defined data
+ parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ # http.response_header.status: status code
+ # http.response_header: response headers
+ # http.response: response body
+ callback: nil,
+ # callback invoked when the request fails
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ errback: nil,
+ stream_callback: nil, # streaming-data callback
+ convert_to_utf8: false, # whether to convert the response to UTF-8
+ overwrite_exist: false, # whether to overwrite an existing file
+ # request options
+ redirects: 3, # maximum number of redirects to follow
+ keepalive: nil, # (connection reuse not yet supported)
+ file: nil, # path of a file to upload
+ path: nil, # request path, useful for pipelined requests (not yet supported)
+ query: nil, # query string, either a String or a Hash
+ body: nil, # request body, either a String or a Hash
+ head: nil, # request headers
+ # connection options
+ connect_timeout: 60, # connection timeout in seconds
+ inactivity_timeout: nil, # inactivity timeout after connecting
+ # SSL options
+ # ssl: {
+ # :private_key_file => '/tmp/server.key',
+ # :cert_chain_file => '/tmp/server.crt',
+ # :verify_peer => false
+ # }
+ ssl: nil,
+ # bind: {
+ # :host => '123.123.123.123', # use a specific interface for outbound request
+ # :port => '123'
+ # }
+ bind: nil,
+ # proxy options
+ # proxy: {
+ # :host => '127.0.0.1', # proxy address
+ # :port => 9000, # proxy port
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+ # :authorization => ['user', 'pass'] # proxy authorization header
+ # }
+ proxy: nil)
+ ```
+
+ ## Callback method forms
+
+ ```ruby
+ # called when the file is saved successfully
+ def parse_eresponse(task_struct)
+ # ...
+ end
+
+ def call_back(task_struct, http_req)
+ # http_req is an EventMachine::HttpRequest object
+ # http_req.response_header.status
+ # ...
+ end
+
+ def err_back(task_struct, http_req)
+ # ...
+ end
+ ```
+
+ ### License
+
+ (MIT License) - Copyright (c) 2016 Charles Zhang
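One thing neither README example shows is the new `custom_data` field, which replaces the old `extra_data` parameter. A minimal sketch, assuming the 2.x API documented above (the payload and paths are illustrative):

```ruby
require 'list_spider'

# custom_data rides along on the task and is readable from any callback.
task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  custom_data: { category: 'tech' }, # hypothetical payload
  parse_method: proc { |t| puts "#{t.local_path}: #{t.custom_data[:category]}" }
)

ListSpider.get_one(task)
```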
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
+ PATH
+ remote: .
+ specs:
+ list_spider (2.0.2)
+ em-http-request (~> 1.1, >= 1.1.3)
+ nokogiri (~> 1.6, >= 1.6.7)
+ rchardet (~> 1.6, >= 1.6.1)
+
+ GEM
+ remote: https://rubygems.org/
+ specs:
+ addressable (2.5.2)
+ public_suffix (>= 2.0.2, < 4.0)
+ cookiejar (0.3.3)
+ em-http-request (1.1.5)
+ addressable (>= 2.3.4)
+ cookiejar (!= 0.3.1)
+ em-socksify (>= 0.3)
+ eventmachine (>= 1.0.3)
+ http_parser.rb (>= 0.6.0)
+ em-socksify (0.3.2)
+ eventmachine (>= 1.0.0.beta.4)
+ eventmachine (1.2.5)
+ http_parser.rb (0.6.0)
+ mini_portile2 (2.3.0)
+ nokogiri (1.8.2)
+ mini_portile2 (~> 2.3.0)
+ public_suffix (3.0.2)
+ rake (10.5.0)
+ rchardet (1.7.0)
+
+ PLATFORMS
+ ruby
+
+ DEPENDENCIES
+ bundler (~> 1.16)
+ list_spider!
+ rake (~> 10.0)
+
+ BUNDLED WITH
+ 1.16.1
data/README.md CHANGED
@@ -1,186 +1,181 @@
- # list_spider
+ # About list_spider

- A url list spider based on em-http-request.
+ list_spider is a crawler tool based on [em-http-request](https://github.com/igrigorik/em-http-request).

- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
+ In many cases a crawler's job is to fetch links, parse the returned data, extract new links, and keep crawling; list_spider is a crawler tool built for exactly this scenario.

- ## Features
- * Duplicate url filtering (based on local path, so you can custom your behavior).
+ ## Features
+ * Duplicate filtering (the local file path is used as the uniqueness key)

- * Convert to UTF-8 support.
+ * UTF-8 conversion supported.

- * Increased spider support (don't spider exist).
+ * Incremental crawling by default: URLs already crawled are not fetched again (an option can force re-fetching).

- * Customize concurrent number and interval between task.
+ * Freely configurable maximum concurrency and interval between crawl tasks.

- * Http options support.
+ * All HTTP options supported.

- ## Getting started
+ ## Getting started

- gem install list_spider
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```

- ## Use like this
+ ## Usage
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- $next_list = []
+ @next_list = []

- def parse_index_item(file_name)
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
- list_group = doc.css("h2.entry-title")
- link_list = list_group.css("a")
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')

  link_list.each do |link|
  href = link['href']
- local_path = DOWNLOAD_DIR + link.content + ".html"
- #or you can save them to database for later use
- $next_list<< TaskStruct.new(href, local_path)
+ local_path = DOWNLOAD_DIR + link.content + '.html'
+ # or save them to a database for later processing
+ @next_list << TaskStruct.new(href, local_path)
  end
  end

  task_list = []
- task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
+ task_list << TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ )

  ListSpider.get_list(task_list)
- ListSpider.get_list($next_list, max: 60)
-
+ ListSpider.get_list(@next_list, max: 60)
  ```

- ## Or in one step
+ ## Or in one simpler step
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
-
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
- list_group = doc.css("h2.entry-title")
- link_list = list_group.css("a")
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')

  link_list.each do |link|
  href = link['href']
- local_path = DOWNLOAD_DIR + link.content + ".html"
+ local_path = DOWNLOAD_DIR + link.content + '.html'
  ListSpider.add_task(TaskStruct.new(href, local_path))
  end
  end

- #get_one is a simple function for one taskstruct situation
- ListSpider.get_one(TaskStruct.new(
- 'https://coolshell.cn/',
- DOWNLOAD_DIR + 'index.html',
- parse_method: method(:parse_index_item)),
- max: 60)
-
+ # get_one is a convenience wrapper around get_list for the single-task case
+ ListSpider.get_one(
+ TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ ),
+ max: 60
+ )
  ```

- ## You can define parse method in four forms
-
- ```ruby
- def parse_response(file_name)
- #...
- end
-
-
- # extra_data is passed by TaskStruct's extra_data param
-
- def parse_response(file_name, extra_data)
- #...
- end
-
-
- # response_header is a EventMachine::HttpResponseHeader object
- # you can use it like this:
- # response_header.status
- # response_header.cookie
- # response_header['Last-Modified']
-
- def parse_response(file_name, extra_data, response_header)
- response_header.status
- response_header['Last-Modified']
-
- #...
- end
-
- # req is a EventMachine::HttpClientOptions object
- # you can use it like this:
- # req.body
- # req.headers
- # req.uri
- # req.host
- # req.port
- def parse_response(file_name, extra_data, response_header, req)
- puts req.body
- puts req.headers
- puts req.uri
- puts req.host
- puts req.port
-
- #...
- end
-
- ```
-
- ## And there are many options you can use
-
- ```ruby
- TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+ ## get_list/get_one parameters
  ```
+ # down_list: array of TaskStructs to request
+ # interval: interval between tasks, 0 by default. A Range sleeps a random number of seconds within that range; RANDOM_TIME sleeps a random 3 to 10 seconds.
+ # max: maximum concurrency, 50 by default. NO_LIMIT_CONCURRENT runs all request tasks concurrently at once.

- ```ruby
- #no concurrent limit (note: only use when list size is small)
- ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
-
- #sleep random time, often used in site which limit spider
- ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
-
- #set random time range
- ListSpider.get_list(down_list, interval: (1..10), max: 1)
-
+ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  ```

- ###Options below will take effect in the whole program (set them before call get_list)
+ ## Options you can set on TaskStruct, essentially the same as [em-http-request](https://github.com/igrigorik/em-http-request)

  ```ruby
- #set proxy
- ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-
- #set http header (if TaskStruct has header it will be used priority)
- ListSpider.set_header_option(header_option)
-
- #convert the file encoding to utf-8
- ListSpider.convert_to_utf8 = true
-
- #set connect timeout
- ListSpider.connect_timeout = 2*60
-
- #over write exist file
- ListSpider.overwrite_exist = false
-
- #set redirect depth
- ListSpider.max_redirects = 10
-
+ new(href, # request URL
+ local_path, # local path to save the data (this path is used as the deduplication key)
+ # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ http_method: :get,
+ custom_data: nil, # user-defined data
+ parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ # http_req.response_header.status: status code
+ # http_req.response_header: response headers
+ # http_req.response: response body
+ callback: nil,
+ # callback invoked when the request fails
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ errback: nil,
+ stream_callback: nil, # streaming-data callback
+ convert_to_utf8: false, # whether to convert the response to UTF-8
+ overwrite_exist: false, # whether to overwrite an existing file
+ # request options
+ redirects: 3, # maximum number of redirects to follow
+ keepalive: nil, # (connection reuse not yet supported)
+ file: nil, # path of a file to upload
+ path: nil, # request path, useful for pipelined requests (not yet supported)
+ query: nil, # query string, either a String or a Hash
+ body: nil, # request body, either a String or a Hash
+ head: nil, # request headers
+ # connection options
+ connect_timeout: 60, # connection timeout in seconds
+ inactivity_timeout: nil, # inactivity timeout after connecting
+ # SSL options
+ # ssl: {
+ # :private_key_file => '/tmp/server.key',
+ # :cert_chain_file => '/tmp/server.crt',
+ # :verify_peer => false
+ # }
+ ssl: nil,
+ # bind: {
+ # :host => '123.123.123.123', # use a specific interface for outbound request
+ # :port => '123'
+ # }
+ bind: nil,
+ # proxy options
+ # proxy: {
+ # :host => '127.0.0.1', # proxy address
+ # :port => 9000, # proxy port
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+ # :authorization => ['user', 'pass'] # proxy authorization header
+ # }
+ proxy: nil)
  ```

- ## There is a util class to help check or delete unvalid file
+ ## Callback function forms

  ```ruby
- FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
+ # called after the file is saved successfully; passed via the parse_method parameter
+ def parse_eresponse(task_struct)
+ # ...
+ end

- #params
- FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request succeeds; passed via the callback parameter
+ def call_back(task_struct, http_req)
+ # http_req is an EventMachine::HttpRequest object
+ # http_req.response_header.status
+ # ...
+ end

- FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ # called when the HTTP request fails; passed via the errback parameter
+ def err_back(task_struct, http_req)
+ # ...
+ end
  ```

- ### License
+ ## License

  (MIT License) - Copyright (c) 2016 Charles Zhang
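The removed 1.x README section configured global state through `ListSpider` setters; in 2.x those knobs move onto each `TaskStruct`. A minimal migration sketch under the API shown in this diff (the header and proxy values are placeholders):

```ruby
require 'list_spider'

task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  convert_to_utf8: true,   # was: ListSpider.convert_to_utf8 = true
  overwrite_exist: true,   # was: ListSpider.overwrite_exist = true
  redirects: 10,           # was: ListSpider.max_redirects = 10
  connect_timeout: 2 * 60, # was: ListSpider.connect_timeout = 2 * 60
  head: { 'user-agent' => 'list_spider' },  # was: ListSpider.set_header_option(...)
  proxy: { host: '127.0.0.1', port: 9000 }  # was: ListSpider.set_proxy(...)
)

ListSpider.get_one(task, interval: ListSpider::RANDOM_TIME, max: 1)
```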
data/lib/file_filter.rb CHANGED
@@ -2,7 +2,8 @@
  class FileFilter
  # 4033
  # 920
- def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
+ def initialize(dir_pattern, size_threshold: 1000,
+ cust_judge: nil, process_block: nil)
  @dir_pattern = dir_pattern
  @size_threshold = size_threshold
  @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
  ).start
  end

- def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+ size_threshold: 1000, cust_judge: nil)
  result_file = File.open(save_file_name, 'wt')
  FileFilter.new(
  dir_pattern,
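For context, a short usage sketch of the reflowed entry point, with the defaults visible in this hunk (the glob pattern and threshold are illustrative):

```ruby
require 'list_spider'

# Record the paths of downloads smaller than 300 bytes (likely failed or
# empty pages) in filtered_file.txt, without deleting anything.
FileFilter.check_save_result(
  'coolshell/*',
  save_file_name: 'filtered_file.txt',
  size_threshold: 300
)
```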
data/lib/list_spider/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ListSpider
- VERSION = '1.0.0'.freeze
+ VERSION = '2.2.0'.freeze
  end
data/lib/list_spider.rb CHANGED
@@ -4,26 +4,108 @@ require 'nokogiri'
  require 'fileutils'
  require 'set'
  require 'addressable/uri'
- require File.expand_path('../spider_helper', __FILE__)
- require File.expand_path('../file_filter', __FILE__)
+ require File.expand_path('spider_helper', __dir__)
+ require File.expand_path('file_filter', __dir__)

+ # Crawl task class
  class TaskStruct
- def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+ # * href: request URL
+ # * local_path: local path to save the data (this path is used as the deduplication key)
+ # * http_method: HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ # * custom_data: user-defined data
+ # * parse_method: callback to parse the saved file; receives the TaskStruct itself
+ def initialize(href, # request URL
+ local_path, # local path to save the data (this path is used as the deduplication key)
+ # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ http_method: :get,
+ custom_data: nil, # user-defined data
+ parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ # http_req.response_header.status: status code
+ # http_req.response_header: response headers
+ # http_req.response: response body
+ callback: nil,
+ # callback invoked when the request fails
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ errback: nil,
+ stream_callback: nil, # streaming-data callback
+ convert_to_utf8: false, # whether to convert the response to UTF-8
+ overwrite_exist: false, # whether to overwrite an existing file
+ # request options
+ redirects: 3, # maximum number of redirects to follow
+ keepalive: nil, # (connection reuse not yet supported)
+ file: nil, # path of a file to upload
+ path: nil, # request path, useful for pipelined requests (not yet supported)
+ query: nil, # query string, either a String or a Hash
+ body: nil, # request body, either a String or a Hash
+ head: nil, # request headers
+ # connection options
+ connect_timeout: 60, # connection timeout in seconds
+ inactivity_timeout: nil, # inactivity timeout after connecting
+ # SSL options
+ # ssl: {
+ # :private_key_file => '/tmp/server.key',
+ # :cert_chain_file => '/tmp/server.crt',
+ # :verify_peer => false
+ # }
+ ssl: nil,
+ # bind: {
+ # :host => '123.123.123.123', # use a specific interface for outbound request
+ # :port => '123'
+ # }
+ bind: nil,
+ # proxy options
+ # proxy: {
+ # :host => '127.0.0.1', # proxy address
+ # :port => 9000, # proxy port
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+ # :authorization => ['user', 'pass'] # proxy authorization header
+ # }
+ proxy: nil)
  @href = href
- @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
  @local_path = local_path
  @http_method = http_method
- @params = params
- @extra_data = extra_data
+ @custom_data = custom_data
  @parse_method = parse_method
- @header = header
+ @callback = callback
+ @errback = errback
+ @stream_callback = stream_callback
+ @convert_to_utf8 = convert_to_utf8
+ @overwrite_exist = overwrite_exist
+
+ @request_options = {
+ redirects: redirects,
+ keepalive: keepalive,
+ file: file,
+ path: path,
+ query: query,
+ body: body,
+ head: head
+ }.compact
+
+ @connection_options = {
+ connect_timeout: connect_timeout,
+ inactivity_timeout: inactivity_timeout,
+ ssl: ssl,
+ bind: bind,
+ proxy: proxy
+ }.compact
  end

- def ==(other)
- other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
- end
-
- attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
+ attr_accessor :href, :local_path,
+ :http_method,
+ :custom_data,
+ :request_object,
+ :parse_method,
+ :callback,
+ :errback,
+ :stream_callback,
+ :convert_to_utf8,
+ :overwrite_exist,
+ :request_options,
+ :connection_options
  end

  module ListSpider
@@ -33,33 +115,44 @@ module ListSpider
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
- @convert_to_utf8 = false
- @connection_opts = { connect_timeout: 60 }
- @overwrite_exist = false
- @max_redirects = 10
  @local_path_set = Set.new

  class << self
- attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
+ def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ if interval.is_a? Range
+ @random_time_range = interval
+ interval = RANDOM_TIME
+ end
+
+ @down_list = filter_list(down_list)
+ @interval = interval
+ @max = max
+ @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+ @succeed_size = 0
+ @failed_size = 0

- def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
- @connection_opts = {
- proxy: {
- host: proxy_addr,
- port: proxy_port
- }
- }
- @connection_opts[:proxy][:authorization] = [username, password] if username && password
+ puts "total size:#{@down_list.size}"
+ event_machine_start_list(next_task, method(:complete))
  end

- def connect_timeout(max_connect_time)
- @connection_opts[:connect_timeout] = max_connect_time
+ def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_list([task], interval: interval, max: max)
  end

- def set_header_option(header_option)
- @header_option = header_option
+ def add_task(task)
+ if task.is_a? Array
+ need_down_list = filter_list(task)
+ @down_list += need_down_list
+ elsif task.is_a?TaskStruct
+ need_down_list = filter_list([task])
+ @down_list += need_down_list
+ else
+ puts "error task type:#{task.class}"
+ end
  end

+ private
+
  def event_machine_down(link_struct_list, callback = nil)
  failed_list = []
  succeed_list = []
@@ -67,78 +160,47 @@ module ListSpider
  begin_time = Time.now

  for_each_proc =
- proc do |e|
- opt = { redirects: @max_redirects }
- if e.header
- opt[:head] = e.header
- elsif defined? @header_option
- opt[:head] = @header_option
- end
+ proc do |task_struct|
+ http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+ http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+ task_struct.request_object = http_req

- if e.http_method == :post
- opt[:body] = e.params unless e.params.empty?
- w =
- if @connection_opts
- EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
- else
- EventMachine::HttpRequest.new(e.href).post opt
- end
- else
- if @connection_opts
- opt[:query] = e.params unless e.params.empty?
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
- else
- w = EventMachine::HttpRequest.new(e.href).get opt
- end
- end
-
- e.request_object = w
+ http_req.callback do
+ s = http_req.response_header.status
+ puts "#{Time.now}, http status code: #{s}"

- w.callback do
- s = w.response_header.status
- puts s
- if s != 404
- local_dir = File.dirname(e.local_path)
+ if s == 200
+ local_dir = File.dirname(task_struct.local_path)
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
  begin
- File.open(e.local_path, 'wb') do |f|
+ File.open(task_struct.local_path, 'wb') do |f|
  f << if @convert_to_utf8 == true
- SpiderHelper.to_utf8(w.response)
+ SpiderHelper.to_utf8(http_req.response)
  else
- w.response
+ http_req.response
  end
  end
- succeed_list << e
- rescue StandardError => e
- puts e
+ call_parse_method(task_struct)
+ succeed_list << task_struct
+ rescue StandardError => exception
+ puts exception
  end
  end
+ task_struct.callback.call(task_struct, http_req) if task_struct.callback
  end
- w.errback do
- puts "errback:#{w.response_header},retry..."
- puts e.href
- puts w.response_header.status

- ret = false
- if e.http_method == :get
- ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
- elsif e.http_method == :post
- ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
- end
+ http_req.errback do
+ puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"

- if ret
- succeed_list << e
- else
- failed_list << e
- end
+ task_struct.errback.call(task_struct, http_req) if task_struct.errback
  end

  begin
- multi.add e.local_path, w
+ multi.add task_struct.local_path, http_req
  rescue StandardError => exception
  puts exception
- puts e.href
- puts e.local_path
+ puts task_struct.href
+ puts task_struct.local_path
  stop_machine
  end
  end
@@ -170,38 +232,15 @@ module ListSpider
  @down_list.shift(@max)
  end

- def call_parse_method(e)
- pm = e.parse_method
- if pm
- case pm.arity
- when 1
- pm.call(e.local_path)
- when 2
- pm.call(e.local_path, e.extra_data)
- when 3
- res_header = nil
- res_header = e.request_object.response_header if e.request_object
- pm.call(e.local_path, e.extra_data, res_header)
- when 4
- res_header = nil
- res_header = e.request_object.response_header if e.request_object
-
- req = nil
- req = e.request_object.req if e.request_object
-
- pm.call(e.local_path, e.extra_data, res_header, req)
- else
- puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
- end
- end
+ def call_parse_method(task_struct)
+ task_struct.parse_method.call(task_struct) if task_struct.parse_method
  end

  def complete(_multi, success_list, failed_list)
  @succeed_size += success_list.size
  @failed_size += failed_list.size
- success_list.each do |e|
- call_parse_method(e)
- end
+ @succeed_list.concat(success_list)
+ @failed_list.concat(failed_list)

  todo = next_task

@@ -223,6 +262,8 @@
  def event_machine_start_list(down_list, callback = nil)
  EventMachine.run do
+ @succeed_list = []
+ @failed_list = []
  @begin_time = Time.now
  if down_list.empty?
  if callback
@@ -239,7 +280,7 @@
  def filter_list(down_list)
  need_down_list = []
  down_list.each do |ts|
- if !@overwrite_exist && File.exist?(ts.local_path)
+ if !ts.overwrite_exist && File.exist?(ts.local_path)
  call_parse_method(ts)
  elsif @local_path_set.add?(ts.local_path)
  need_down_list << ts
@@ -247,43 +288,6 @@
  end
  need_down_list
  end
-
- def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
- if interval.is_a? Range
- @random_time_range = interval
- interval = RANDOM_TIME
- end
-
- @down_list = []
-
- need_down_list = filter_list(down_list)
-
- @down_list += need_down_list
- @interval = interval
- @max = max
- @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
- @succeed_size = 0
- @failed_size = 0
-
- puts "total size:#{@down_list.size}"
- event_machine_start_list(next_task, method(:complete))
- end
-
- def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
- get_list([task], interval: interval, max: max)
- end
-
- def add_task(task)
- if task.is_a? Array
- need_down_list = filter_list(task)
- @down_list += need_down_list
- elsif task.is_a?TaskStruct
- need_down_list = filter_list([task])
- @down_list += need_down_list
- else
- puts "error task type:#{task.class}"
- end
- end
  end

  Signal.trap('INT') do
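The rewritten dispatch proc above hands each task straight to em-http-request via `public_send` and routes results through the task's own hooks. A minimal sketch of that flow, assuming the API in this diff (URL and path are placeholders):

```ruby
require 'list_spider'

task = TaskStruct.new(
  'https://example.com/',  # hypothetical URL
  'pages/example.html',
  parse_method: proc { |t| puts "parsed #{t.local_path}" },
  callback: proc { |t, http| puts "#{t.href} status #{http.response_header.status}" },
  errback: proc { |t, http| puts "#{t.href} failed: #{http.error}" }
)

ListSpider.get_one(task)
```

Note that `parse_method` now always receives the `TaskStruct` itself, replacing the four arity-dependent forms removed above.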
data/lib/spider_helper.rb CHANGED
@@ -3,8 +3,9 @@ require 'net/http'

  module SpiderHelper
  class << self
- def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
- href = string_to_uri(href) if href.class == ''.class
+ def direct_http_get(href, local_path, params: nil,
+ header: nil, convert_to_utf8: false)
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
  href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
  false
  end

- def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
- href = string_to_uri(href) if href.class == ''.class
+ def direct_http_post(href, local_path, params,
+ header: nil, convert_to_utf8: false)
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
  req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@ module SpiderHelper

  def string_to_uri(href)
  l = href
- l.sub!('http:///', 'http://') if l.start_with?('http:///')
+ l.sub!('http:///', 'http://')
  l = Addressable::URI.parse(l)
  l.normalize!
  end
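A quick sketch of what the simplified `string_to_uri` does, based on the code above (the input URL is illustrative; the exact normalized form comes from Addressable):

```ruby
require 'list_spider'

# sub! repairs a malformed triple-slash scheme in place; Addressable then
# parses and normalizes the result (e.g. lowercasing the host).
uri = SpiderHelper.string_to_uri('http:///Example.COM/index.html')
puts uri.to_s # => "http://example.com/index.html"
```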
data/list_spider.gemspec CHANGED
@@ -1,5 +1,5 @@

- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'list_spider/version'

@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'rake', '~> 10.0'

  spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
- spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
+ spec.add_dependency 'nokogiri', '~> 1.11'
  spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
  end
data/spider_example.rb CHANGED
@@ -3,8 +3,8 @@ require 'list_spider'

  DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
  list_group = doc.css('h2.entry-title')
  link_list = list_group.css('a')
@@ -16,8 +16,6 @@ def parse_index_item(file_name)
  end
  end

- # ListSpider.convert_to_utf8 = true
-
  # get_one is a simple function for one taskstruct situation
  ListSpider.get_one(
  TaskStruct.new(
data/spider_example_2.rb CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze

  @next_list = []

- def parse_index_item(file_name)
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
  list_group = doc.css('h2.entry-title')
  link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: list_spider
  version: !ruby/object:Gem::Version
- version: 1.0.0
+ version: 2.2.0
  platform: ruby
  authors:
  - Charles Zhang
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-01-29 00:00:00.000000000 Z
+ date: 2019-09-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -64,20 +64,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.6'
- - - ">="
- - !ruby/object:Gem::Version
- version: 1.6.7
+ version: '1.11'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.6'
- - - ">="
- - !ruby/object:Gem::Version
- version: 1.6.7
+ version: '1.11'
  - !ruby/object:Gem::Dependency
  name: rchardet
  requirement: !ruby/object:Gem::Requirement
@@ -106,8 +100,11 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".rdoc_options"
  - ".rubocop.yml"
+ - English_README.md
  - Gemfile
+ - Gemfile.lock
  - README.md
  - Rakefile
  - bin/console
@@ -139,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.3
+ rubygems_version: 3.0.3
  signing_key:
  specification_version: 4
  summary: List Spider