list_spider 2.0.2 → 2.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 4a69a63919c971b077855f236631b5b91895e11836338c8970dec65214f92a4c
-   data.tar.gz: 6e0fefdd9a9eba8bfbead303bfbe2e27a0d98171bd60cdae89864f026289c987
+   metadata.gz: 2af55a6c3577dc734aa7ee545cef217059abfc7be4724eaac9cf94126b869b0e
+   data.tar.gz: 48e8f116b91e36613b05958f173a1bc168c0c6daa163fd137266515c3a19c2b7
  SHA512:
-   metadata.gz: 11a2591f94021e7bb3d06bb7a712f98cf8329142150cf09979d44cc65f1803c809e9077d85ad5d41c83457b20c2bf9694a3a65692898145eb7738e18460e40ba
-   data.tar.gz: 3e393c808733042ba3b13a950fcc63f7498641a55d16a5ccae11f3bf1100ca04558e32873bd9063602abd6c72194e8d90492b9b292e47cbd08bfc44476f5417f
+   metadata.gz: 778ae0918059fd2edea3a02081cf479d054521f216afa789f4d9131708b8339486f39b9f7603e303de91f4c03b1bb7ebf30e6b45ac0921fe0c29640743df9e5d
+   data.tar.gz: bcfc6df857085630faf802f3cff9d21653c2d8ced9b2595a3bc92a8093d8883cd470132b770801bc2e4977ad17e5f82aea726a2ad600ab5d1560150dede7c20f
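These digests cover `metadata.gz` and `data.tar.gz`, the two entries inside the packaged `.gem` tar archive. A minimal sketch for recomputing the SHA256 values from a local copy of the release (the local file name is an assumption):

```ruby
require 'digest'
require 'rubygems/package'

# Hypothetical local copy of the 2.1.0 release.
gem_path = 'list_spider-2.1.0.gem'

# A .gem file is a plain tar archive; metadata.gz and data.tar.gz are
# entries inside it, each hashed individually in checksums.yaml.
File.open(gem_path, 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end
```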
.rdoc_options ADDED
@@ -0,0 +1,23 @@
+ --- !ruby/object:RDoc::Options
+ encoding: UTF-8
+ static_path: []
+ rdoc_include:
+ - "."
+ - "/Users/zhangchao/github/list_spider"
+ charset: UTF-8
+ exclude:
+ hyperlink_all: false
+ line_numbers: false
+ locale:
+ locale_dir: locale
+ locale_name:
+ main_page:
+ markup: markdown
+ output_decoration: true
+ page_dir:
+ show_hash: false
+ tab_width: 8
+ template_stylesheets: []
+ title:
+ visibility: :protected
+ webcvs:
.rubocop.yml CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
    Enabled: false
  Lint/AmbiguousRegexpLiteral:
    Enabled: false
- Lint/DefEndAlignment:
+ Layout/DefEndAlignment:
    AutoCorrect: true
- Lint/EndAlignment:
+ Layout/EndAlignment:
    AutoCorrect: true
  Style/BracesAroundHashParameters:
    Enabled: false
English_README.md ADDED
@@ -0,0 +1,169 @@
+ # list_spider
+
+ A URL-list spider based on em-http-request.
+
+ Often we only need to crawl a list of URLs, parse the results, and then crawl the newly found links. list_spider is built for that purpose.
+
+ ## Features
+ * Duplicate URL filtering (based on the local save path, so you can customize the behavior).
+
+ * Optional conversion of downloaded content to UTF-8.
+
+ * Incremental crawling (files that already exist are not fetched again).
+
+ * Configurable concurrency limit and interval between tasks.
+
+ * Full HTTP options support.
+
+ ## Getting started
+
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```
+
+ ## Use like this
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ @next_list = []
+
+ def parse_index_item(e)
+   content = File.read(e.local_path)
+   doc = Nokogiri::HTML(content)
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
+
+   link_list.each do |link|
+     href = link['href']
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     # or save them to a database for later use
+     @next_list << TaskStruct.new(href, local_path)
+   end
+ end
+
+ task_list = []
+ task_list << TaskStruct.new(
+   'https://coolshell.cn/',
+   DOWNLOAD_DIR + 'index.html',
+   parse_method: method(:parse_index_item)
+ )
+
+ ListSpider.get_list(task_list)
+ ListSpider.get_list(@next_list, max: 60)
+ ```
+
+ ## Or in one step
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ def parse_index_item(e)
+   content = File.read(e.local_path)
+   doc = Nokogiri::HTML(content)
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
+
+   link_list.each do |link|
+     href = link['href']
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     ListSpider.add_task(TaskStruct.new(href, local_path))
+   end
+ end
+
+ # get_one is a convenience wrapper for the single-TaskStruct case
+ ListSpider.get_one(
+   TaskStruct.new(
+     'https://coolshell.cn/',
+     DOWNLOAD_DIR + 'index.html',
+     parse_method: method(:parse_index_item)
+   ),
+   max: 60
+ )
+ ```
+
+ ## And there are many options you can use
+
+ ```ruby
+ def initialize(href, # request URL
+                local_path, # local path to save the data (also the deduplication key)
+                # HTTP method; one of :get, :head, :delete, :put, :post, :patch, :options
+                http_method: :get,
+                custom_data: nil, # user-defined data
+                parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+                # callback invoked after a successful request; the file may not have been saved yet (e.g. 301, 404)
+                # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                # http.response_header.status  status code
+                # http.response_header         response headers
+                # http.response                response body
+                callback: nil,
+                # callback invoked after a failed request
+                # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                errback: nil,
+                stream_callback: nil, # streaming-data callback
+                convert_to_utf8: false, # convert the content to UTF-8?
+                overwrite_exist: false, # overwrite an existing file?
+                # request options
+                redirects: 3, # number of redirects to follow
+                keepalive: nil, # (connection reuse is not supported yet)
+                file: nil, # path of a file to upload
+                path: nil, # request path, useful for pipelined requests (not supported yet)
+                query: nil, # query string, String or Hash
+                body: nil, # request body, String or Hash
+                head: nil, # request headers
+                # connection options
+                connect_timeout: 60, # connection timeout
+                inactivity_timeout: nil, # inactivity timeout after connecting
+                # SSL settings
+                # ssl: {
+                #   :private_key_file => '/tmp/server.key',
+                #   :cert_chain_file => '/tmp/server.crt',
+                #   :verify_peer => false
+                # }
+                ssl: nil,
+                # bind: {
+                #   :host => '123.123.123.123', # use a specific interface for outbound request
+                #   :port => '123'
+                # }
+                bind: nil,
+                # proxy settings
+                # proxy: {
+                #   :host => '127.0.0.1', # proxy address
+                #   :port => 9000, # proxy port
+                #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+                #   :authorization => ['user', 'pass'] # proxy authorization header
+                # }
+                proxy: nil)
+ ```
+
+ ## Callback method signatures
+
+ ```ruby
+ # called when the file is saved successfully
+ def parse_eresponse(task_struct)
+   # ...
+ end
+
+ def call_back(task_struct, http_req)
+   # http_req is an EventMachine::HttpRequest object
+   # http_req.response_header.status
+   # ...
+ end
+
+ def err_back(task_struct, http_req)
+   # ...
+ end
+ ```
+
+ ### License
+
+ (MIT License) - Copyright (c) 2016 Charles Zhang
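The options above map one-to-one onto em-http-request's request and connection options. A small sketch exercising a few of them together (the user-agent string and local path are illustrative placeholders, not defaults of the gem):

```ruby
require 'list_spider'

def report(task_struct)
  puts "saved #{task_struct.local_path}"
end

task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  head: { 'user-agent' => 'list_spider-example' }, # request headers
  redirects: 5,           # follow up to five redirects
  convert_to_utf8: true,  # re-encode the saved body as UTF-8
  overwrite_exist: true,  # fetch even if the file already exists
  parse_method: method(:report)
)

ListSpider.get_one(task, max: 1)
```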
Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     list_spider (2.0.1)
+     list_spider (2.0.2)
        em-http-request (~> 1.1, >= 1.1.3)
        nokogiri (~> 1.6, >= 1.6.7)
        rchardet (~> 1.6, >= 1.6.1)
data/README.md CHANGED
@@ -1,186 +1,181 @@
- # list_spider
+ # About list_spider
 
- A url list spider based on em-http-request.
+ list_spider is a crawler tool based on [em-http-request](https://github.com/igrigorik/em-http-request).
 
- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
+ In many cases a crawler's job is to fetch links, parse the returned data, extract new links from it, and keep crawling; list_spider is a tool built for exactly that scenario.
 
- ## Features
- * Duplicate url filtering (based on local path, so you can custom your behavior).
+ ## Features
+ * Duplicate filtering (the local file path is used as the uniqueness key).
 
- * Convert to UTF-8 support.
+ * UTF-8 conversion support.
 
- * Increased spider support (don't spider exist).
+ * Incremental crawling by default; already-fetched pages are not crawled again (an option can force re-fetching).
 
- * Customize concurrent number and interval between task.
+ * Freely configurable maximum concurrency and interval between tasks.
 
- * Http options support.
+ * Supports all HTTP options.
 
- ## Getting started
+ ## Getting started
 
- gem install list_spider
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```
 
- ## Use like this
+ ## Usage
  ```ruby
  require 'list_spider'
 
- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze
 
- $next_list = []
+ @next_list = []
 
- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
-   list_group = doc.css("h2.entry-title")
-   link_list = list_group.css("a")
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
 
    link_list.each do |link|
      href = link['href']
-     local_path = DOWNLOAD_DIR + link.content + ".html"
-     #or you can save them to database for later use
-     $next_list<< TaskStruct.new(href, local_path)
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     # these can be stored in a database for later processing
+     @next_list << TaskStruct.new(href, local_path)
    end
  end
 
  task_list = []
- task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
+ task_list << TaskStruct.new(
+   'https://coolshell.cn/',
+   DOWNLOAD_DIR + 'index.html',
+   parse_method: method(:parse_index_item)
+ )
 
  ListSpider.get_list(task_list)
- ListSpider.get_list($next_list, max: 60)
-
+ ListSpider.get_list(@next_list, max: 60)
  ```
 
- ## Or in one step
+ ## Or use the simpler one-step form
  ```ruby
  require 'list_spider'
 
- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze
 
- def parse_index_item(file_name)
-
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
-   list_group = doc.css("h2.entry-title")
-   link_list = list_group.css("a")
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
 
    link_list.each do |link|
      href = link['href']
-     local_path = DOWNLOAD_DIR + link.content + ".html"
+     local_path = DOWNLOAD_DIR + link.content + '.html'
      ListSpider.add_task(TaskStruct.new(href, local_path))
    end
  end
 
- #get_one is a simple function for one taskstruct situation
- ListSpider.get_one(TaskStruct.new(
-   'https://coolshell.cn/',
-   DOWNLOAD_DIR + 'index.html',
-   parse_method: method(:parse_index_item)),
-   max: 60)
-
+ # get_one is a convenience wrapper around get_list for the single-task case
+ ListSpider.get_one(
+   TaskStruct.new(
+     'https://coolshell.cn/',
+     DOWNLOAD_DIR + 'index.html',
+     parse_method: method(:parse_index_item)
+   ),
+   max: 60
+ )
  ```
 
- ## You can define parse method in four forms
-
- ```ruby
- def parse_response(file_name)
-   #...
- end
-
-
- # custom_data is passed by TaskStruct's custom_data param
-
- def parse_response(file_name, custom_data)
-   #...
- end
-
-
- # response_header is a EventMachine::HttpResponseHeader object
- # you can use it like this:
- # response_header.status
- # response_header.cookie
- # response_header['Last-Modified']
-
- def parse_response(file_name, custom_data, response_header)
-   response_header.status
-   response_header['Last-Modified']
-
-   #...
- end
-
- # req is a EventMachine::HttpClientOptions object
- # you can use it like this:
- # req.body
- # req.headers
- # req.uri
- # req.host
- # req.port
- def parse_response(file_name, custom_data, response_header, req)
-   puts req.body
-   puts req.headers
-   puts req.uri
-   puts req.host
-   puts req.port
-
-   #...
- end
-
- ```
-
- ## And there are many options you can use
-
- ```ruby
- TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
+ ## get_list/get_one parameters
  ```
+ # down_list: the array of TaskStructs to request
+ # interval: interval between tasks, 0 by default. If a Range is given, the spider sleeps a random number of seconds drawn from that Range. RANDOM_TIME means a random 3-10 second interval.
+ # max: maximum concurrency, 50 by default. NO_LIMIT_CONCURRENT runs all request tasks concurrently at once.
 
- ```ruby
- #no concurrent limit (note: only use when list size is small)
- ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
-
- #sleep random time, often used in site which limit spider
- ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
-
- #set random time range
- ListSpider.get_list(down_list, interval: (1..10), max: 1)
-
+ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  ```
 
- ###Options below will take effect in the whole program (set them before call get_list)
+ ## TaskStruct options, essentially the same as in [em-http-request](https://github.com/igrigorik/em-http-request)
 
  ```ruby
- #set proxy
- ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-
- #set http header (if TaskStruct has header it will be used priority)
- ListSpider.set_header_option(header_option)
-
- #convert the file encoding to utf-8
- ListSpider.convert_to_utf8 = true
-
- #set connect timeout
- ListSpider.connect_timeout = 2*60
-
- #over write exist file
- ListSpider.overwrite_exist = false
-
- #set redirect depth
- ListSpider.max_redirects = 10
-
+ new(href, # request URL
+     local_path, # local path to save the data (also the deduplication key)
+     # HTTP method; one of :get, :head, :delete, :put, :post, :patch, :options
+     http_method: :get,
+     custom_data: nil, # user-defined data
+     parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+     # callback invoked after a successful request; the file may not have been saved yet (e.g. 301, 404)
+     # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+     # http_req.response_header.status  status code
+     # http_req.response_header         response headers
+     # http_req.response                response body
+     callback: nil,
+     # callback invoked after a failed request
+     # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+     errback: nil,
+     stream_callback: nil, # streaming-data callback
+     convert_to_utf8: false, # convert the content to UTF-8?
+     overwrite_exist: false, # overwrite an existing file?
+     # request settings
+     redirects: 3, # number of redirects to follow
+     keepalive: nil, # (connection reuse is not supported yet)
+     file: nil, # path of a file to upload
+     path: nil, # request path, useful for pipelined requests (not supported yet)
+     query: nil, # query string, String or Hash
+     body: nil, # request body, String or Hash
+     head: nil, # request headers
+     # connection settings
+     connect_timeout: 60, # connection timeout
+     inactivity_timeout: nil, # inactivity timeout after connecting
+     # SSL settings
+     # ssl: {
+     #   :private_key_file => '/tmp/server.key',
+     #   :cert_chain_file => '/tmp/server.crt',
+     #   :verify_peer => false
+     # }
+     ssl: nil,
+     # bind: {
+     #   :host => '123.123.123.123', # use a specific interface for outbound request
+     #   :port => '123'
+     # }
+     bind: nil,
+     # proxy settings
+     # proxy: {
+     #   :host => '127.0.0.1', # proxy address
+     #   :port => 9000, # proxy port
+     #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+     #   :authorization => ['user', 'pass'] # proxy authorization header
+     # }
+     proxy: nil)
  ```
 
- ## There is a util class to help check or delete unvalid file
+ ## Callback signatures
 
  ```ruby
- FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
+ # called after the file is saved successfully; passed via the parse_method parameter
+ def parse_eresponse(task_struct)
+   # ...
+ end
 
- #params
- FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request succeeds; passed via the callback parameter
+ def call_back(task_struct, http_req)
+   # http_req is an EventMachine::HttpRequest object
+   # http_req.response_header.status
+   # ...
+ end
 
- FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request fails; passed via the errback parameter
+ def err_back(task_struct, http_req)
+   # ...
+ end
  ```
 
- ### License
+ ## License
 
  (MIT License) - Copyright (c) 2016 Charles Zhang
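The removed English examples above already show how `interval` and `max` behave; restated compactly as a sketch, with `down_list` standing in for any array of TaskStructs:

```ruby
require 'list_spider'

down_list = [
  TaskStruct.new('https://coolshell.cn/page/1', 'coolshell/1.html'),
  TaskStruct.new('https://coolshell.cn/page/2', 'coolshell/2.html')
]

# No concurrency limit: every task starts at once (only for small lists).
ListSpider.get_list(down_list, max: ListSpider::NO_LIMIT_CONCURRENT)

# One task at a time, sleeping a random 3-10 seconds between tasks.
ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)

# One task at a time with a random interval drawn from 1-10 seconds.
ListSpider.get_list(down_list, interval: (1..10), max: 1)
```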
lib/list_spider.rb CHANGED
@@ -4,10 +4,16 @@ require 'nokogiri'
  require 'fileutils'
  require 'set'
  require 'addressable/uri'
- require File.expand_path('../spider_helper', __FILE__)
- require File.expand_path('../file_filter', __FILE__)
+ require File.expand_path('spider_helper', __dir__)
+ require File.expand_path('file_filter', __dir__)
 
+ # Crawl task class
  class TaskStruct
+   # * href: request URL
+   # * local_path: local path to save the data (also the deduplication key)
+   # * http_method: HTTP method; one of :get, :head, :delete, :put, :post, :patch, :options
+   # * custom_data: user-defined data
+   # * parse_method: callback to parse the saved file; receives the TaskStruct itself
    def initialize(href, # request URL
                   local_path, # local path to save the data (also the deduplication key)
                   # HTTP method; one of :get, :head, :delete, :put, :post, :patch, :options
@@ -16,9 +22,9 @@ class TaskStruct
                   parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
                   # callback invoked after a successful request; the file may not have been saved yet (e.g. 301, 404)
                   # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
-                  # http.response_header.status  status code
-                  # http.response_header         response headers
-                  # http.response                response body
+                  # http_req.response_header.status  status code
+                  # http_req.response_header         response headers
+                  # http_req.response                response body
                   callback: nil,
                   # callback invoked after a failed request
                   # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
@@ -26,7 +32,7 @@ class TaskStruct
                   stream_callback: nil, # streaming-data callback
                   convert_to_utf8: false, # convert the content to UTF-8?
                   overwrite_exist: false, # overwrite an existing file?
-                  # request options
+                  # request settings
                   redirects: 3, # number of redirects to follow
                   keepalive: nil, # (connection reuse is not supported yet)
                   file: nil, # path of a file to upload
@@ -34,7 +40,7 @@ class TaskStruct
                   query: nil, # query string, String or Hash
                   body: nil, # request body, String or Hash
                   head: nil, # request headers
-                  # connection options
+                  # connection settings
                   connect_timeout: 60, # connection timeout
                   inactivity_timeout: nil, # inactivity timeout after connecting
                   # SSL settings
@@ -112,6 +118,41 @@ module ListSpider
    @local_path_set = Set.new
 
    class << self
+     def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+       if interval.is_a? Range
+         @random_time_range = interval
+         interval = RANDOM_TIME
+       end
+
+       @down_list = filter_list(down_list)
+       @interval = interval
+       @max = max
+       @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+       @succeed_size = 0
+       @failed_size = 0
+
+       puts "total size:#{@down_list.size}"
+       event_machine_start_list(next_task, method(:complete))
+     end
+
+     def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+       get_list([task], interval: interval, max: max)
+     end
+
+     def add_task(task)
+       if task.is_a? Array
+         need_down_list = filter_list(task)
+         @down_list += need_down_list
+       elsif task.is_a?TaskStruct
+         need_down_list = filter_list([task])
+         @down_list += need_down_list
+       else
+         puts "error task type:#{task.class}"
+       end
+     end
+
+     private
+
      def event_machine_down(link_struct_list, callback = nil)
        failed_list = []
        succeed_list = []
@@ -247,43 +288,6 @@ module ListSpider
        end
        need_down_list
      end
-
-     def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-       if interval.is_a? Range
-         @random_time_range = interval
-         interval = RANDOM_TIME
-       end
-
-       @down_list = []
-
-       need_down_list = filter_list(down_list)
-
-       @down_list += need_down_list
-       @interval = interval
-       @max = max
-       @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-       @succeed_size = 0
-       @failed_size = 0
-
-       puts "total size:#{@down_list.size}"
-       event_machine_start_list(next_task, method(:complete))
-     end
-
-     def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-       get_list([task], interval: interval, max: max)
-     end
-
-     def add_task(task)
-       if task.is_a? Array
-         need_down_list = filter_list(task)
-         @down_list += need_down_list
-       elsif task.is_a?TaskStruct
-         need_down_list = filter_list([task])
-         @down_list += need_down_list
-       else
-         puts "error task type:#{task.class}"
-       end
-     end
    end
 
  Signal.trap('INT') do
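The move above places `get_list`, `get_one`, and `add_task` ahead of the new `private` marker, making the public entry points explicit. Per the README, `add_task` is meant to queue follow-up work from inside a parse callback; a condensed sketch:

```ruby
require 'list_spider'

def parse_index(task)
  doc = Nokogiri::HTML(File.read(task.local_path))
  doc.css('h2.entry-title a').each do |link|
    # add_task accepts a single TaskStruct or an array of them.
    ListSpider.add_task(TaskStruct.new(link['href'], "coolshell/#{link.content}.html"))
  end
end

ListSpider.get_one(
  TaskStruct.new('https://coolshell.cn/', 'coolshell/index.html',
                 parse_method: method(:parse_index)),
  max: 30
)
```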
lib/list_spider/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ListSpider
-   VERSION = '2.0.2'.freeze
+   VERSION = '2.1.0'.freeze
  end
list_spider.gemspec CHANGED
@@ -1,5 +1,5 @@
 
- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'list_spider/version'
 
@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
    spec.add_development_dependency 'rake', '~> 10.0'
 
    spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
-   spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
+   spec.add_dependency 'nokogiri', '>= 1.8.5'
    spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
  end
@@ -1,5 +1,5 @@
- # require 'list_spider'
- require File.expand_path('../lib/list_spider', __FILE__)
+ require 'list_spider'
+ # require File.expand_path('../lib/list_spider', __FILE__)
 
  DOWNLOAD_DIR = 'coolshell/'.freeze
 
@@ -16,8 +16,6 @@ def parse_index_item(e)
    end
  end
 
- # ListSpider.convert_to_utf8 = true
-
  # get_one is a simple function for one taskstruct situation
  ListSpider.get_one(
    TaskStruct.new(
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze
 
  @next_list = []
 
- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
    list_group = doc.css('h2.entry-title')
    link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: list_spider
  version: !ruby/object:Gem::Version
-   version: 2.0.2
+   version: 2.1.0
  platform: ruby
  authors:
  - Charles Zhang
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-02-28 00:00:00.000000000 Z
+ date: 2019-06-21 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -62,22 +62,16 @@ dependencies:
    name: nokogiri
    requirement: !ruby/object:Gem::Requirement
      requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '1.6'
      - - ">="
        - !ruby/object:Gem::Version
-         version: 1.6.7
+         version: 1.8.5
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '1.6'
      - - ">="
        - !ruby/object:Gem::Version
-         version: 1.6.7
+         version: 1.8.5
  - !ruby/object:Gem::Dependency
    name: rchardet
    requirement: !ruby/object:Gem::Requirement
@@ -106,7 +100,9 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".rdoc_options"
  - ".rubocop.yml"
+ - English_README.md
  - Gemfile
  - Gemfile.lock
  - README.md
@@ -140,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.3
+ rubygems_version: 3.0.1
  signing_key:
  specification_version: 4
  summary: List Spider