list_spider 1.0.0 → 2.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
- data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
+ metadata.gz: 39600b837bb18841d083c7b50dbaadf82e72c3013f690129af6786efec193a39
+ data.tar.gz: 4128e673c551e3fcc2c1f9d4a3302407bcf7bc26829a4957d04ebc0505d5ce07
  SHA512:
- metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
- data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
+ metadata.gz: f900e8f76086f37239872d9b4452f5d735799100879ac16570d29c9570837adca52c3c9e37c725913920a68add7784bc2f94e2cef42663c54930ae5b3e37ec50
+ data.tar.gz: 90495a4dae2552c3f41e55f0efa61fef0511581eb2e13d90256e0a585c48f7fdb2af167cd8c6daa98ca80c2229d970187fbcec3db4a8edd43738f76f79c18951
data/.rdoc_options ADDED
@@ -0,0 +1,23 @@
+ --- !ruby/object:RDoc::Options
+ encoding: UTF-8
+ static_path: []
+ rdoc_include:
+ - "."
+ - "/Users/zhangchao/github/list_spider"
+ charset: UTF-8
+ exclude:
+ hyperlink_all: false
+ line_numbers: false
+ locale:
+ locale_dir: locale
+ locale_name:
+ main_page:
+ markup: markdown
+ output_decoration: true
+ page_dir:
+ show_hash: false
+ tab_width: 8
+ template_stylesheets: []
+ title:
+ visibility: :protected
+ webcvs:
data/.rubocop.yml CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
  Enabled: false
  Lint/AmbiguousRegexpLiteral:
  Enabled: false
- Lint/DefEndAlignment:
+ Layout/DefEndAlignment:
  AutoCorrect: true
- Lint/EndAlignment:
+ Layout/EndAlignment:
  AutoCorrect: true
  Style/BracesAroundHashParameters:
  Enabled: false
data/English_README.md ADDED
@@ -0,0 +1,169 @@
+ # list_spider
+
+ A URL list spider based on em-http-request.
+
+ Often we only need to crawl a list of URLs, parse the results, and then crawl again. list_spider is built for that purpose.
+
+ ## Features
+ * Duplicate URL filtering (based on the local path, so you can customize the behavior).
+
+ * Optional conversion of responses to UTF-8.
+
+ * Incremental crawling (files that already exist are not fetched again).
+
+ * Configurable concurrency limit and interval between tasks.
+
+ * Full HTTP option support.
+
+ ## Getting started
+
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```
+
+ ## Use like this
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ @next_list = []
+
+ def parse_index_item(e)
+   content = File.read(e.local_path)
+   doc = Nokogiri::HTML(content)
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
+
+   link_list.each do |link|
+     href = link['href']
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     # or save them to a database for later use
+     @next_list << TaskStruct.new(href, local_path)
+   end
+ end
+
+ task_list = []
+ task_list << TaskStruct.new(
+   'https://coolshell.cn/',
+   DOWNLOAD_DIR + 'index.html',
+   parse_method: method(:parse_index_item)
+ )
+
+ ListSpider.get_list(task_list)
+ ListSpider.get_list(@next_list, max: 60)
+ ```
+
+ ## Or in one step
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ def parse_index_item(e)
+   content = File.read(e.local_path)
+   doc = Nokogiri::HTML(content)
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
+
+   link_list.each do |link|
+     href = link['href']
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     ListSpider.add_task(TaskStruct.new(href, local_path))
+   end
+ end
+
+ # get_one is a convenience wrapper around get_list for a single TaskStruct
+ ListSpider.get_one(
+   TaskStruct.new(
+     'https://coolshell.cn/',
+     DOWNLOAD_DIR + 'index.html',
+     parse_method: method(:parse_index_item)
+   ),
+   max: 60
+ )
+ ```
+
+ ## And there are many options you can use
+
+ ```ruby
+ def initialize(href, # request URL
+                local_path, # local path to save the data (used as the deduplication key)
+                # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+                http_method: :get,
+                custom_data: nil, # custom user data
+                parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+                # callback after a successful request; the file may not have been saved (e.g. 301, 404)
+                # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                # http.response_header.status: status code
+                # http.response_header: response headers
+                # http.response: response body
+                callback: nil,
+                # callback after a failed request
+                # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                errback: nil,
+                stream_callback: nil, # callback for streamed response data
+                convert_to_utf8: false, # whether to convert the response to UTF-8
+                overwrite_exist: false, # whether to overwrite an existing file
+                # request options
+                redirects: 3, # number of redirects to follow
+                keepalive: nil, # (connection reuse is not supported yet)
+                file: nil, # path of a file to upload
+                path: nil, # request path, useful for pipelined requests (not supported yet)
+                query: nil, # query string, a String or a Hash
+                body: nil, # request body, a String or a Hash
+                head: nil, # request headers
+                # connection options
+                connect_timeout: 60, # connection timeout
+                inactivity_timeout: nil, # timeout after the connection is established
+                # SSL options
+                # ssl: {
+                #   :private_key_file => '/tmp/server.key',
+                #   :cert_chain_file => '/tmp/server.crt',
+                #   :verify_peer => false
+                # }
+                ssl: nil,
+                # bind: {
+                #   :host => '123.123.123.123', # use a specific interface for outbound request
+                #   :port => '123'
+                # }
+                bind: nil,
+                # proxy options
+                # proxy: {
+                #   :host => '127.0.0.1', # proxy address
+                #   :port => 9000, # proxy port
+                #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+                #   :authorization => ['user', 'pass'] # proxy authorization header
+                # }
+                proxy: nil)
+ ```
+
+ ## Callback method signatures
+
+ ```ruby
+ # called when the file is saved successfully
+ def parse_eresponse(task_struct)
+   # ...
+ end
+
+ def call_back(task_struct, http_req)
+   # http_req is an EventMachine::HttpRequest object
+   # http_req.response_header.status
+   # ...
+ end
+
+ def err_back(task_struct, http_req)
+   # ...
+ end
+ ```
+
+ ### License
+
+ (MIT License) - Copyright (c) 2016 Charles Zhang
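
Taken together, the options and callbacks documented in this new README compose as in the following sketch; the URL, paths, handler names, and option values are illustrative, not part of the gem:

```ruby
require 'list_spider'

# Hypothetical status logger wired through the documented callback: option.
log_status = proc do |task, http_req|
  puts "#{task.href} -> #{http_req.response_header.status}"
end

task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  redirects: 5,            # follow up to five redirects
  convert_to_utf8: true,   # re-encode the saved file as UTF-8
  overwrite_exist: true,   # re-fetch even if the file already exists
  callback: log_status,
  errback: proc { |t, _req| puts "failed: #{t.href}" }
)

ListSpider.get_one(task)
```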
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
+ PATH
+   remote: .
+   specs:
+     list_spider (2.0.2)
+       em-http-request (~> 1.1, >= 1.1.3)
+       nokogiri (~> 1.6, >= 1.6.7)
+       rchardet (~> 1.6, >= 1.6.1)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     addressable (2.5.2)
+       public_suffix (>= 2.0.2, < 4.0)
+     cookiejar (0.3.3)
+     em-http-request (1.1.5)
+       addressable (>= 2.3.4)
+       cookiejar (!= 0.3.1)
+       em-socksify (>= 0.3)
+       eventmachine (>= 1.0.3)
+       http_parser.rb (>= 0.6.0)
+     em-socksify (0.3.2)
+       eventmachine (>= 1.0.0.beta.4)
+     eventmachine (1.2.5)
+     http_parser.rb (0.6.0)
+     mini_portile2 (2.3.0)
+     nokogiri (1.8.2)
+       mini_portile2 (~> 2.3.0)
+     public_suffix (3.0.2)
+     rake (10.5.0)
+     rchardet (1.7.0)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bundler (~> 1.16)
+   list_spider!
+   rake (~> 10.0)
+
+ BUNDLED WITH
+   1.16.1
data/README.md CHANGED
@@ -1,186 +1,181 @@
- # list_spider
+ # About list_spider

- A url list spider based on em-http-request.
+ list_spider is a crawler tool based on [em-http-request](https://github.com/igrigorik/em-http-request).

- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
+ In many cases a crawler's job is to fetch URLs, parse the returned data, extract new links, and keep crawling; list_spider is a crawler built for exactly that scenario.

- ## Features
- * Duplicate url filtering (based on local path, so you can custom your behavior).
+ ## Features
+ * Duplicate filtering (the local file path is used as the uniqueness check).

- * Convert to UTF-8 support.
+ * UTF-8 conversion support.

- * Increased spider support (don't spider exist).
+ * Incremental crawling by default; anything already fetched is not fetched again (an option can force re-fetching).

- * Customize concurrent number and interval between task.
+ * Configurable maximum concurrency and interval between crawl tasks.

- * Http options support.
+ * All HTTP options can be set.

- ## Getting started
+ ## Getting started

- gem install list_spider
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```

- ## Use like this
+ ## Usage
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- $next_list = []
+ @next_list = []

- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
-   list_group = doc.css("h2.entry-title")
-   link_list = list_group.css("a")
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')

    link_list.each do |link|
      href = link['href']
-     local_path = DOWNLOAD_DIR + link.content + ".html"
-     #or you can save them to database for later use
-     $next_list<< TaskStruct.new(href, local_path)
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     # or save them to a database for later processing
+     @next_list << TaskStruct.new(href, local_path)
    end
  end

  task_list = []
- task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
+ task_list << TaskStruct.new(
+   'https://coolshell.cn/',
+   DOWNLOAD_DIR + 'index.html',
+   parse_method: method(:parse_index_item)
+ )

  ListSpider.get_list(task_list)
- ListSpider.get_list($next_list, max: 60)
-
+ ListSpider.get_list(@next_list, max: 60)
  ```

- ## Or in one step
+ ## Or, more simply, in one step
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
-
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
-   list_group = doc.css("h2.entry-title")
-   link_list = list_group.css("a")
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')

    link_list.each do |link|
      href = link['href']
-     local_path = DOWNLOAD_DIR + link.content + ".html"
+     local_path = DOWNLOAD_DIR + link.content + '.html'
      ListSpider.add_task(TaskStruct.new(href, local_path))
    end
  end

- #get_one is a simple function for one taskstruct situation
- ListSpider.get_one(TaskStruct.new(
-   'https://coolshell.cn/',
-   DOWNLOAD_DIR + 'index.html',
-   parse_method: method(:parse_index_item)),
-   max: 60)
-
+ # get_one is a simplified wrapper around get_list, convenient for a single task
+ ListSpider.get_one(
+   TaskStruct.new(
+     'https://coolshell.cn/',
+     DOWNLOAD_DIR + 'index.html',
+     parse_method: method(:parse_index_item)
+   ),
+   max: 60
+ )
  ```

- ## You can define parse method in four forms
-
- ```ruby
- def parse_response(file_name)
-   #...
- end
-
-
- # extra_data is passed by TaskStruct's extra_data param
-
- def parse_response(file_name, extra_data)
-   #...
- end
-
-
- # response_header is a EventMachine::HttpResponseHeader object
- # you can use it like this:
- # response_header.status
- # response_header.cookie
- # response_header['Last-Modified']
-
- def parse_response(file_name, extra_data, response_header)
-   response_header.status
-   response_header['Last-Modified']
-
-   #...
- end
-
- # req is a EventMachine::HttpClientOptions object
- # you can use it like this:
- # req.body
- # req.headers
- # req.uri
- # req.host
- # req.port
- def parse_response(file_name, extra_data, response_header, req)
-   puts req.body
-   puts req.headers
-   puts req.uri
-   puts req.host
-   puts req.port
-
-   #...
- end
-
- ```
-
- ## And there are many options you can use
-
- ```ruby
- TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+ ## get_list/get_one parameters
  ```
+ # down_list: the array of TaskStructs to request
+ # interval: interval between tasks, 0 by default. If a Range is given, the spider sleeps a random number of seconds within that Range; RANDOM_TIME means a random 3 to 10 seconds.
+ # max: maximum concurrency, 50 by default. NO_LIMIT_CONCURRENT runs all request tasks concurrently at once.

- ```ruby
- #no concurrent limit (note: only use when list size is small)
- ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
-
- #sleep random time, often used in site which limit spider
- ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
-
- #set random time range
- ListSpider.get_list(down_list, interval: (1..10), max: 1)
-
+ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  ```

- ###Options below will take effect in the whole program (set them before call get_list)
+ ## Options supported by TaskStruct, essentially the same as in [em-http-request](https://github.com/igrigorik/em-http-request)

  ```ruby
- #set proxy
- ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-
- #set http header (if TaskStruct has header it will be used priority)
- ListSpider.set_header_option(header_option)
-
- #convert the file encoding to utf-8
- ListSpider.convert_to_utf8 = true
-
- #set connect timeout
- ListSpider.connect_timeout = 2*60
-
- #over write exist file
- ListSpider.overwrite_exist = false
-
- #set redirect depth
- ListSpider.max_redirects = 10
-
+ new(href, # request URL
+     local_path, # local path to save the data (used as the deduplication key)
+     # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+     http_method: :get,
+     custom_data: nil, # custom user data
+     parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+     # callback after a successful request; the file may not have been saved (e.g. 301, 404)
+     # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+     # http_req.response_header.status: status code
+     # http_req.response_header: response headers
+     # http_req.response: response body
+     callback: nil,
+     # callback after a failed request
+     # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+     errback: nil,
+     stream_callback: nil, # callback for streamed response data
+     convert_to_utf8: false, # whether to convert the response to UTF-8
+     overwrite_exist: false, # whether to overwrite an existing file
+     # request options
+     redirects: 3, # number of redirects to follow
+     keepalive: nil, # (connection reuse is not supported yet)
+     file: nil, # path of a file to upload
+     path: nil, # request path, useful for pipelined requests (not supported yet)
+     query: nil, # query string, a String or a Hash
+     body: nil, # request body, a String or a Hash
+     head: nil, # request headers
+     # connection options
+     connect_timeout: 60, # connection timeout
+     inactivity_timeout: nil, # timeout after the connection is established
+     # SSL options
+     # ssl: {
+     #   :private_key_file => '/tmp/server.key',
+     #   :cert_chain_file => '/tmp/server.crt',
+     #   :verify_peer => false
+     # }
+     ssl: nil,
+     # bind: {
+     #   :host => '123.123.123.123', # use a specific interface for outbound request
+     #   :port => '123'
+     # }
+     bind: nil,
+     # proxy options
+     # proxy: {
+     #   :host => '127.0.0.1', # proxy address
+     #   :port => 9000, # proxy port
+     #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+     #   :authorization => ['user', 'pass'] # proxy authorization header
+     # }
+     proxy: nil)
  ```

- ## There is a util class to help check or delete unvalid file
+ ## Callback method signatures

  ```ruby
- FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
+ # called after the file is saved successfully; passed via the parse_method parameter
+ def parse_eresponse(task_struct)
+   # ...
+ end

- #params
- FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request succeeds; passed via the callback parameter
+ def call_back(task_struct, http_req)
+   # http_req is an EventMachine::HttpRequest object
+   # http_req.response_header.status
+   # ...
+ end

- FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request fails; passed via the errback parameter
+ def err_back(task_struct, http_req)
+   # ...
+ end
  ```

- ### License
+ ## License

  (MIT License) - Copyright (c) 2016 Charles Zhang
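
As a quick illustration of the interval and max parameters documented above, this sketch crawls politely, one request at a time with a random pause; the URLs and paths are placeholders:

```ruby
require 'list_spider'

tasks = (1..5).map do |i|
  TaskStruct.new("https://example.com/page/#{i}", "pages/#{i}.html")
end

# A Range interval sleeps a random number of seconds from that Range
# between tasks; max: 1 serializes the requests.
ListSpider.get_list(tasks, interval: (1..10), max: 1)
```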
data/lib/file_filter.rb CHANGED
@@ -2,7 +2,8 @@
  class FileFilter
    # 4033
    # 920
-   def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
+   def initialize(dir_pattern, size_threshold: 1000,
+                  cust_judge: nil, process_block: nil)
      @dir_pattern = dir_pattern
      @size_threshold = size_threshold
      @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
      ).start
    end

-   def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+   def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+                              size_threshold: 1000, cust_judge: nil)
      result_file = File.open(save_file_name, 'wt')
      FileFilter.new(
        dir_pattern,
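
The re-wrapped signatures keep the same keywords, so existing calls are unaffected. A short usage sketch of the entry points above; the glob pattern and threshold are illustrative:

```ruby
# Report files under 300 bytes matching the glob, writing the offending
# paths to filtered_file.txt instead of deleting them.
FileFilter.check_save_result('coolshell/*', save_file_name: 'filtered_file.txt',
                             size_threshold: 300)

# Or delete them outright once the report looks right.
FileFilter.delete('coolshell/*', size_threshold: 300)
```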
data/lib/list_spider/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ListSpider
-   VERSION = '1.0.0'.freeze
+   VERSION = '2.2.0'.freeze
  end
data/lib/list_spider.rb CHANGED
@@ -4,26 +4,108 @@ require 'nokogiri'
  require 'fileutils'
  require 'set'
  require 'addressable/uri'
- require File.expand_path('../spider_helper', __FILE__)
- require File.expand_path('../file_filter', __FILE__)
+ require File.expand_path('spider_helper', __dir__)
+ require File.expand_path('file_filter', __dir__)

+ # Crawl task class
  class TaskStruct
-   def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+   # * href: request URL
+   # * local_path: local path to save the data (used as the deduplication key)
+   # * http_method: HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+   # * custom_data: custom user data
+   # * parse_method: callback for parsing the saved file; receives the TaskStruct itself
+   def initialize(href, # request URL
+                  local_path, # local path to save the data (used as the deduplication key)
+                  # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+                  http_method: :get,
+                  custom_data: nil, # custom user data
+                  parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+                  # callback after a successful request; the file may not have been saved (e.g. 301, 404)
+                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                  # http_req.response_header.status: status code
+                  # http_req.response_header: response headers
+                  # http_req.response: response body
+                  callback: nil,
+                  # callback after a failed request
+                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                  errback: nil,
+                  stream_callback: nil, # callback for streamed response data
+                  convert_to_utf8: false, # whether to convert the response to UTF-8
+                  overwrite_exist: false, # whether to overwrite an existing file
+                  # request options
+                  redirects: 3, # number of redirects to follow
+                  keepalive: nil, # (connection reuse is not supported yet)
+                  file: nil, # path of a file to upload
+                  path: nil, # request path, useful for pipelined requests (not supported yet)
+                  query: nil, # query string, a String or a Hash
+                  body: nil, # request body, a String or a Hash
+                  head: nil, # request headers
+                  # connection options
+                  connect_timeout: 60, # connection timeout
+                  inactivity_timeout: nil, # timeout after the connection is established
+                  # SSL options
+                  # ssl: {
+                  #   :private_key_file => '/tmp/server.key',
+                  #   :cert_chain_file => '/tmp/server.crt',
+                  #   :verify_peer => false
+                  # }
+                  ssl: nil,
+                  # bind: {
+                  #   :host => '123.123.123.123', # use a specific interface for outbound request
+                  #   :port => '123'
+                  # }
+                  bind: nil,
+                  # proxy options
+                  # proxy: {
+                  #   :host => '127.0.0.1', # proxy address
+                  #   :port => 9000, # proxy port
+                  #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+                  #   :authorization => ['user', 'pass'] # proxy authorization header
+                  # }
+                  proxy: nil)
      @href = href
-     @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
      @local_path = local_path
      @http_method = http_method
-     @params = params
-     @extra_data = extra_data
+     @custom_data = custom_data
      @parse_method = parse_method
-     @header = header
+     @callback = callback
+     @errback = errback
+     @stream_callback = stream_callback
+     @convert_to_utf8 = convert_to_utf8
+     @overwrite_exist = overwrite_exist
+
+     @request_options = {
+       redirects: redirects,
+       keepalive: keepalive,
+       file: file,
+       path: path,
+       query: query,
+       body: body,
+       head: head
+     }.compact
+
+     @connection_options = {
+       connect_timeout: connect_timeout,
+       inactivity_timeout: inactivity_timeout,
+       ssl: ssl,
+       bind: bind,
+       proxy: proxy
+     }.compact
    end

-   def ==(other)
-     other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
-   end
-
-   attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
+   attr_accessor :href, :local_path,
+                 :http_method,
+                 :custom_data,
+                 :request_object,
+                 :parse_method,
+                 :callback,
+                 :errback,
+                 :stream_callback,
+                 :convert_to_utf8,
+                 :overwrite_exist,
+                 :request_options,
+                 :connection_options
  end

  module ListSpider
@@ -33,33 +115,44 @@ module ListSpider
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
- @convert_to_utf8 = false
- @connection_opts = { connect_timeout: 60 }
- @overwrite_exist = false
- @max_redirects = 10
  @local_path_set = Set.new

  class << self
-   attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
+   def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+     if interval.is_a? Range
+       @random_time_range = interval
+       interval = RANDOM_TIME
+     end
+
+     @down_list = filter_list(down_list)
+     @interval = interval
+     @max = max
+     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+     @succeed_size = 0
+     @failed_size = 0

-   def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-     @connection_opts = {
-       proxy: {
-         host: proxy_addr,
-         port: proxy_port
-       }
-     }
-     @connection_opts[:proxy][:authorization] = [username, password] if username && password
+     puts "total size:#{@down_list.size}"
+     event_machine_start_list(next_task, method(:complete))
    end

-   def connect_timeout(max_connect_time)
-     @connection_opts[:connect_timeout] = max_connect_time
+   def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+     get_list([task], interval: interval, max: max)
    end

-   def set_header_option(header_option)
-     @header_option = header_option
+   def add_task(task)
+     if task.is_a? Array
+       need_down_list = filter_list(task)
+       @down_list += need_down_list
+     elsif task.is_a?TaskStruct
+       need_down_list = filter_list([task])
+       @down_list += need_down_list
+     else
+       puts "error task type:#{task.class}"
+     end
    end

+   private
+
    def event_machine_down(link_struct_list, callback = nil)
      failed_list = []
      succeed_list = []
@@ -67,78 +160,47 @@ module ListSpider
      begin_time = Time.now

      for_each_proc =
-       proc do |e|
-         opt = { redirects: @max_redirects }
-         if e.header
-           opt[:head] = e.header
-         elsif defined? @header_option
-           opt[:head] = @header_option
-         end
+       proc do |task_struct|
+         http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+         http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+         task_struct.request_object = http_req

-         if e.http_method == :post
-           opt[:body] = e.params unless e.params.empty?
-           w =
-             if @connection_opts
-               EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
-             else
-               EventMachine::HttpRequest.new(e.href).post opt
-             end
-         else
-           if @connection_opts
-             opt[:query] = e.params unless e.params.empty?
-             w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
-           else
-             w = EventMachine::HttpRequest.new(e.href).get opt
-           end
-         end
-
-         e.request_object = w
+         http_req.callback do
+           s = http_req.response_header.status
+           puts "#{Time.now}, http status code: #{s}"

-         w.callback do
-           s = w.response_header.status
-           puts s
-           if s != 404
-             local_dir = File.dirname(e.local_path)
+           if s == 200
+             local_dir = File.dirname(task_struct.local_path)
              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
              begin
-               File.open(e.local_path, 'wb') do |f|
+               File.open(task_struct.local_path, 'wb') do |f|
                  f << if @convert_to_utf8 == true
-                        SpiderHelper.to_utf8(w.response)
+                        SpiderHelper.to_utf8(http_req.response)
                       else
-                        w.response
+                        http_req.response
                       end
                end
-               succeed_list << e
-             rescue StandardError => e
-               puts e
+               call_parse_method(task_struct)
+               succeed_list << task_struct
+             rescue StandardError => exception
+               puts exception
              end
            end
+           task_struct.callback.call(task_struct, http_req) if task_struct.callback
          end
-         w.errback do
-           puts "errback:#{w.response_header},retry..."
-           puts e.href
-           puts w.response_header.status

-           ret = false
-           if e.http_method == :get
-             ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
-           elsif e.http_method == :post
-             ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
-           end
+         http_req.errback do
+           puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"

-           if ret
-             succeed_list << e
-           else
-             failed_list << e
-           end
+           task_struct.errback.call(task_struct, http_req) if task_struct.errback
          end

          begin
-           multi.add e.local_path, w
+           multi.add task_struct.local_path, http_req
          rescue StandardError => exception
            puts exception
-           puts e.href
-           puts e.local_path
+           puts task_struct.href
+           puts task_struct.local_path
            stop_machine
          end
        end
@@ -170,38 +232,15 @@ module ListSpider
      @down_list.shift(@max)
    end

-   def call_parse_method(e)
-     pm = e.parse_method
-     if pm
-       case pm.arity
-       when 1
-         pm.call(e.local_path)
-       when 2
-         pm.call(e.local_path, e.extra_data)
-       when 3
-         res_header = nil
-         res_header = e.request_object.response_header if e.request_object
-         pm.call(e.local_path, e.extra_data, res_header)
-       when 4
-         res_header = nil
-         res_header = e.request_object.response_header if e.request_object
-
-         req = nil
-         req = e.request_object.req if e.request_object
-
-         pm.call(e.local_path, e.extra_data, res_header, req)
-       else
-         puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
-       end
-     end
+   def call_parse_method(task_struct)
+     task_struct.parse_method.call(task_struct) if task_struct.parse_method
    end

    def complete(_multi, success_list, failed_list)
      @succeed_size += success_list.size
      @failed_size += failed_list.size
-     success_list.each do |e|
-       call_parse_method(e)
-     end
+     @succeed_list.concat(success_list)
+     @failed_list.concat(failed_list)

      todo = next_task
@@ -223,6 +262,8 @@ module ListSpider

    def event_machine_start_list(down_list, callback = nil)
      EventMachine.run do
+       @succeed_list = []
+       @failed_list = []
        @begin_time = Time.now
        if down_list.empty?
          if callback
@@ -239,7 +280,7 @@ module ListSpider
    def filter_list(down_list)
      need_down_list = []
      down_list.each do |ts|
-       if !@overwrite_exist && File.exist?(ts.local_path)
+       if !ts.overwrite_exist && File.exist?(ts.local_path)
          call_parse_method(ts)
        elsif @local_path_set.add?(ts.local_path)
          need_down_list << ts
@@ -247,43 +288,6 @@ module ListSpider
      end
      need_down_list
    end
-
-   def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-     if interval.is_a? Range
-       @random_time_range = interval
-       interval = RANDOM_TIME
-     end
-
-     @down_list = []
-
-     need_down_list = filter_list(down_list)
-
-     @down_list += need_down_list
-     @interval = interval
-     @max = max
-     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-     @succeed_size = 0
-     @failed_size = 0
-
-     puts "total size:#{@down_list.size}"
-     event_machine_start_list(next_task, method(:complete))
-   end
-
-   def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-     get_list([task], interval: interval, max: max)
-   end
-
-   def add_task(task)
-     if task.is_a? Array
-       need_down_list = filter_list(task)
-       @down_list += need_down_list
-     elsif task.is_a?TaskStruct
-       need_down_list = filter_list([task])
-       @down_list += need_down_list
-     else
-       puts "error task type:#{task.class}"
-     end
-   end
  end

  Signal.trap('INT') do
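
The rewritten for_each_proc hands each task's two option hashes straight to em-http-request via public_send; stripped of the surrounding bookkeeping, each task reduces to roughly this sketch (URL and values are illustrative):

```ruby
require 'em-http-request'

EventMachine.run do
  # connection_options and request_options are the hashes TaskStruct
  # builds with .compact, so nil-valued keys never reach em-http-request.
  conn = EventMachine::HttpRequest.new('https://coolshell.cn/',
                                       connect_timeout: 60)
  http = conn.public_send(:get, redirects: 3)

  http.callback { puts http.response_header.status; EventMachine.stop }
  http.errback  { puts http.error; EventMachine.stop }
end
```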
data/lib/spider_helper.rb CHANGED
@@ -3,8 +3,9 @@ require 'net/http'

  module SpiderHelper
    class << self
-     def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
-       href = string_to_uri(href) if href.class == ''.class
+     def direct_http_get(href, local_path, params: nil,
+                         header: nil, convert_to_utf8: false)
+       href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

        begin
          href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
        false
      end

-     def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
-       href = string_to_uri(href) if href.class == ''.class
+     def direct_http_post(href, local_path, params,
+                          header: nil, convert_to_utf8: false)
+       href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

        begin
          req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@ module SpiderHelper

      def string_to_uri(href)
        l = href
-       l.sub!('http:///', 'http://') if l.start_with?('http:///')
+       l.sub!('http:///', 'http://')
        l = Addressable::URI.parse(l)
        l.normalize!
      end
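
Dropping the start_with? guard is harmless here because String#sub! only rewrites the first match and returns nil when nothing matches, while l itself is what gets parsed; the normalization path looks roughly like this:

```ruby
require 'addressable/uri'

# Mirrors string_to_uri: collapse a malformed triple slash in place,
# then let Addressable normalize host case and percent-encoding.
l = 'http:///Example.com/a b'
l.sub!('http:///', 'http://')
uri = Addressable::URI.parse(l)
uri.normalize!
puts uri.to_s # => "http://example.com/a%20b"
```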
data/list_spider.gemspec CHANGED
@@ -1,5 +1,5 @@

- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'list_spider/version'

@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'rake', '~> 10.0'

  spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
- spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
+ spec.add_dependency 'nokogiri', '~> 1.11'
  spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
  end
data/spider_example.rb CHANGED
@@ -3,8 +3,8 @@ require 'list_spider'

  DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
    list_group = doc.css('h2.entry-title')
    link_list = list_group.css('a')
@@ -16,8 +16,6 @@ def parse_index_item(file_name)
    end
  end

- # ListSpider.convert_to_utf8 = true
-
  # get_one is a simple function for one taskstruct situation
  ListSpider.get_one(
    TaskStruct.new(
data/spider_example_2.rb CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze

  @next_list = []

- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
    list_group = doc.css('h2.entry-title')
    link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: list_spider
  version: !ruby/object:Gem::Version
-   version: 1.0.0
+   version: 2.2.0
  platform: ruby
  authors:
  - Charles Zhang
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-01-29 00:00:00.000000000 Z
+ date: 2019-09-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -64,20 +64,14 @@ dependencies:
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-       version: '1.6'
-   - - ">="
-     - !ruby/object:Gem::Version
-       version: 1.6.7
+       version: '1.11'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.6'
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: 1.6.7
+         version: '1.11'
  - !ruby/object:Gem::Dependency
    name: rchardet
    requirement: !ruby/object:Gem::Requirement
@@ -106,8 +100,11 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".rdoc_options"
  - ".rubocop.yml"
+ - English_README.md
  - Gemfile
+ - Gemfile.lock
  - README.md
  - Rakefile
  - bin/console
@@ -139,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.3
+ rubygems_version: 3.0.3
  signing_key:
  specification_version: 4
  summary: List Spider