list_spider 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4a69a63919c971b077855f236631b5b91895e11836338c8970dec65214f92a4c
4
- data.tar.gz: 6e0fefdd9a9eba8bfbead303bfbe2e27a0d98171bd60cdae89864f026289c987
3
+ metadata.gz: 2af55a6c3577dc734aa7ee545cef217059abfc7be4724eaac9cf94126b869b0e
4
+ data.tar.gz: 48e8f116b91e36613b05958f173a1bc168c0c6daa163fd137266515c3a19c2b7
5
5
  SHA512:
6
- metadata.gz: 11a2591f94021e7bb3d06bb7a712f98cf8329142150cf09979d44cc65f1803c809e9077d85ad5d41c83457b20c2bf9694a3a65692898145eb7738e18460e40ba
7
- data.tar.gz: 3e393c808733042ba3b13a950fcc63f7498641a55d16a5ccae11f3bf1100ca04558e32873bd9063602abd6c72194e8d90492b9b292e47cbd08bfc44476f5417f
6
+ metadata.gz: 778ae0918059fd2edea3a02081cf479d054521f216afa789f4d9131708b8339486f39b9f7603e303de91f4c03b1bb7ebf30e6b45ac0921fe0c29640743df9e5d
7
+ data.tar.gz: bcfc6df857085630faf802f3cff9d21653c2d8ced9b2595a3bc92a8093d8883cd470132b770801bc2e4977ad17e5f82aea726a2ad600ab5d1560150dede7c20f
@@ -0,0 +1,23 @@
1
+ --- !ruby/object:RDoc::Options
2
+ encoding: UTF-8
3
+ static_path: []
4
+ rdoc_include:
5
+ - "."
6
+ - "/Users/zhangchao/github/list_spider"
7
+ charset: UTF-8
8
+ exclude:
9
+ hyperlink_all: false
10
+ line_numbers: false
11
+ locale:
12
+ locale_dir: locale
13
+ locale_name:
14
+ main_page:
15
+ markup: markdown
16
+ output_decoration: true
17
+ page_dir:
18
+ show_hash: false
19
+ tab_width: 8
20
+ template_stylesheets: []
21
+ title:
22
+ visibility: :protected
23
+ webcvs:
@@ -18,9 +18,9 @@ Style/Documentation:
18
18
  Enabled: false
19
19
  Lint/AmbiguousRegexpLiteral:
20
20
  Enabled: false
21
- Lint/DefEndAlignment:
21
+ Layout/DefEndAlignment:
22
22
  AutoCorrect: true
23
- Lint/EndAlignment:
23
+ Layout/EndAlignment:
24
24
  AutoCorrect: true
25
25
  Style/BracesAroundHashParameters:
26
26
  Enabled: false
@@ -0,0 +1,169 @@
1
+ # list_spider
2
+
3
+ A url list spider based on em-http-request.
4
+
5
+ Often we only need to spider a list of urls, parse the downloaded pages, extract new links, and spider again. This tool is built for exactly that purpose.
6
+
7
+ ## Features
8
+ * Duplicate url filtering (based on local path, so you can custom your behavior).
9
+
10
+ * Convert to UTF-8 support.
11
+
12
+ * Incremental spidering support (files that already exist locally are not fetched again).
13
+
14
+ * Customizable concurrency limit and interval between tasks.
15
+
16
+ * Http options support.
17
+
18
+ ## Getting started
19
+
20
+ ```ruby
21
+ gem install list_spider
22
+ ```
23
+
24
+ Or add it to your Gemfile
25
+
26
+ ```ruby
27
+ gem 'list_spider'
28
+ ```
29
+
30
+ ## Use like this
31
+ ```ruby
32
+ require 'list_spider'
33
+
34
+ DOWNLOAD_DIR = 'coolshell/'.freeze
35
+
36
+ @next_list = []
37
+
38
+ def parse_index_item(e)
39
+ content = File.read(e.local_path)
40
+ doc = Nokogiri::HTML(content)
41
+ list_group = doc.css('h2.entry-title')
42
+ link_list = list_group.css('a')
43
+
44
+ link_list.each do |link|
45
+ href = link['href']
46
+ local_path = DOWNLOAD_DIR + link.content + '.html'
47
+ # or you can save them to database for later use
48
+ @next_list << TaskStruct.new(href, local_path)
49
+ end
50
+ end
51
+
52
+ task_list = []
53
+ task_list << TaskStruct.new(
54
+ 'https://coolshell.cn/',
55
+ DOWNLOAD_DIR + 'index.html',
56
+ parse_method: method(:parse_index_item)
57
+ )
58
+
59
+ ListSpider.get_list(task_list)
60
+ ListSpider.get_list(@next_list, max: 60)
61
+ ```
62
+
63
+ ## Or in one step
64
+ ```ruby
65
+ require 'list_spider'
66
+
67
+ DOWNLOAD_DIR = 'coolshell/'.freeze
68
+
69
+ def parse_index_item(e)
70
+ content = File.read(e.local_path)
71
+ doc = Nokogiri::HTML(content)
72
+ list_group = doc.css('h2.entry-title')
73
+ link_list = list_group.css('a')
74
+
75
+ link_list.each do |link|
76
+ href = link['href']
77
+ local_path = DOWNLOAD_DIR + link.content + '.html'
78
+ ListSpider.add_task(TaskStruct.new(href, local_path))
79
+ end
80
+ end
81
+
82
+ # get_one is a convenience method for running a single TaskStruct
83
+ ListSpider.get_one(
84
+ TaskStruct.new(
85
+ 'https://coolshell.cn/',
86
+ DOWNLOAD_DIR + 'index.html',
87
+ parse_method: method(:parse_index_item)
88
+ ),
89
+ max: 60
90
+ )
91
+ ```
92
+
93
+ ## And there are many options you can use
94
+
95
+ ```ruby
96
+ def initialize(href, # 请求链接
97
+ local_path, # 保存数据的本地路径(此路径作为去重标准)
98
+ # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
99
+ http_method: :get,
100
+ custom_data: nil, # 自定义数据
101
+ parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
102
+ # 请求成功后的回调,此时可能没有保存文件,比如301,404
103
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
104
+ # http.response_header.status 状态码
105
+ # http.response_header 返回头
106
+ # http.response 返回体
107
+ callback: nil,
108
+ # 请求失败后的回调
109
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
110
+ errback: nil,
111
+ stream_callback: nil, # 流数据处理回调
112
+ convert_to_utf8: false, # 是否转换为utf8编码
113
+ overwrite_exist: false, # 是否覆盖现有文件
114
+ # request options
115
+ redirects: 3, # 重定向次数
116
+ keepalive: nil, # (暂不支持复用)
117
+ file: nil, # 要上传的文件路径
118
+ path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
119
+ query: nil, # 查询字符串,可以是string或hash类型
120
+ body: nil, # 请求体,可以是string或hash类型
121
+ head: nil, # 请求头
122
+ # connection options
123
+ connect_timeout: 60, # 连接超时时间
124
+ inactivity_timeout: nil, # 连接后超时时间
125
+ # ssl设置
126
+ # ssl: {
127
+ # :private_key_file => '/tmp/server.key',
128
+ # :cert_chain_file => '/tmp/server.crt',
129
+ # :verify_peer => false
130
+ # }
131
+ ssl: nil,
132
+ # bind: {
133
+ # :host => '123.123.123.123', # use a specific interface for outbound request
134
+ # :port => '123'
135
+ # }
136
+ bind: nil,
137
+ # 代理设置
138
+ # proxy: {
139
+ # :host => '127.0.0.1', # proxy address
140
+ # :port => 9000, # proxy port
141
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
142
+
143
+ # :authorization => ['user', 'pass'] # proxy authorization header
144
+ # }
145
+ proxy: nil)
146
+ ```
147
+
148
+ ## Callback methods form
149
+
150
+ ```ruby
151
+ # called when the file is saved successfully
152
+ def parse_eresponse(task_struct)
153
+ # ...
154
+ end
155
+
156
+ def call_back(task_struct, http_req)
157
+ # http_req is a EventMachine::HttpRequest object
158
+ # http_req.response_header.status
159
+ # ...
160
+ end
161
+
162
+ def err_back(task_struct, http_req)
163
+ # ...
164
+ end
165
+ ```
166
+
167
+ ### License
168
+
169
+ (MIT License) - Copyright (c) 2016 Charles Zhang
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- list_spider (2.0.1)
4
+ list_spider (2.0.2)
5
5
  em-http-request (~> 1.1, >= 1.1.3)
6
6
  nokogiri (~> 1.6, >= 1.6.7)
7
7
  rchardet (~> 1.6, >= 1.6.1)
data/README.md CHANGED
@@ -1,186 +1,181 @@
1
- # list_spider
1
+ # 关于list_spider
2
2
 
3
- A url list spider based on em-http-request.
3
+ list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
4
4
 
5
- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
5
+ 许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
6
6
 
7
- ## Features
8
- * Duplicate url filtering (based on local path, so you can custom your behavior).
7
+ ## 功能特点
8
+ * 去重过滤 (使用本地文件路径做唯一性校验)
9
9
 
10
- * Convert to UTF-8 support.
10
+ * 支持UTF-8编码转换。
11
11
 
12
- * Increased spider support (don't spider exist).
12
+ * 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
13
13
 
14
- * Customize concurrent number and interval between task.
14
+ * 自由设置最大并发数和爬取任务间隔时间。
15
15
 
16
- * Http options support.
16
+ * 支持http所有选项设置。
17
17
 
18
- ## Getting started
18
+ ## 开始
19
19
 
20
- gem install list_spider
20
+ ```ruby
21
+ gem install list_spider
22
+ ```
23
+
24
+ 或者添加到Gemfile
25
+
26
+ ```ruby
27
+ gem 'list_spider'
28
+ ```
21
29
 
22
- ## Use like this
30
+ ## 使用方法
23
31
  ```ruby
24
32
  require 'list_spider'
25
33
 
26
- DOWNLOAD_DIR = 'coolshell/'
34
+ DOWNLOAD_DIR = 'coolshell/'.freeze
27
35
 
28
- $next_list = []
36
+ @next_list = []
29
37
 
30
- def parse_index_item(file_name)
31
- content = File.read(file_name)
38
+ def parse_index_item(e)
39
+ content = File.read(e.local_path)
32
40
  doc = Nokogiri::HTML(content)
33
- list_group = doc.css("h2.entry-title")
34
- link_list = list_group.css("a")
41
+ list_group = doc.css('h2.entry-title')
42
+ link_list = list_group.css('a')
35
43
 
36
44
  link_list.each do |link|
37
45
  href = link['href']
38
- local_path = DOWNLOAD_DIR + link.content + ".html"
39
- #or you can save them to database for later use
40
- $next_list<< TaskStruct.new(href, local_path)
46
+ local_path = DOWNLOAD_DIR + link.content + '.html'
47
+ # 可以存入数据库后续处理
48
+ @next_list << TaskStruct.new(href, local_path)
41
49
  end
42
50
  end
43
51
 
44
52
  task_list = []
45
- task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
53
+ task_list << TaskStruct.new(
54
+ 'https://coolshell.cn/',
55
+ DOWNLOAD_DIR + 'index.html',
56
+ parse_method: method(:parse_index_item)
57
+ )
46
58
 
47
59
  ListSpider.get_list(task_list)
48
- ListSpider.get_list($next_list, max: 60)
49
-
60
+ ListSpider.get_list(@next_list, max: 60)
50
61
  ```
51
62
 
52
- ## Or in one step
63
+ ## 或者使用更简单的一步完成
53
64
  ```ruby
54
65
  require 'list_spider'
55
66
 
56
- DOWNLOAD_DIR = 'coolshell/'
67
+ DOWNLOAD_DIR = 'coolshell/'.freeze
57
68
 
58
- def parse_index_item(file_name)
59
-
60
- content = File.read(file_name)
69
+ def parse_index_item(e)
70
+ content = File.read(e.local_path)
61
71
  doc = Nokogiri::HTML(content)
62
- list_group = doc.css("h2.entry-title")
63
- link_list = list_group.css("a")
72
+ list_group = doc.css('h2.entry-title')
73
+ link_list = list_group.css('a')
64
74
 
65
75
  link_list.each do |link|
66
76
  href = link['href']
67
- local_path = DOWNLOAD_DIR + link.content + ".html"
77
+ local_path = DOWNLOAD_DIR + link.content + '.html'
68
78
  ListSpider.add_task(TaskStruct.new(href, local_path))
69
79
  end
70
80
  end
71
81
 
72
- #get_one is a simple function for one taskstruct situation
73
- ListSpider.get_one(TaskStruct.new(
74
- 'https://coolshell.cn/',
75
- DOWNLOAD_DIR + 'index.html',
76
- parse_method: method(:parse_index_item)),
77
- max: 60)
78
-
82
+ # get_one是封装了get_list的简化形式,方便一个任务时调用
83
+ ListSpider.get_one(
84
+ TaskStruct.new(
85
+ 'https://coolshell.cn/',
86
+ DOWNLOAD_DIR + 'index.html',
87
+ parse_method: method(:parse_index_item)
88
+ ),
89
+ max: 60
90
+ )
79
91
  ```
80
92
 
81
- ## You can define parse method in four forms
82
-
83
- ```ruby
84
- def parse_response(file_name)
85
- #...
86
- end
87
-
88
-
89
- # custom_data is passed by TaskStruct's custom_data param
90
-
91
- def parse_response(file_name, custom_data)
92
- #...
93
- end
94
-
95
-
96
- # response_header is a EventMachine::HttpResponseHeader object
97
- # you can use it like this:
98
- # response_header.status
99
- # response_header.cookie
100
- # response_header['Last-Modified']
101
-
102
- def parse_response(file_name, custom_data, response_header)
103
- response_header.status
104
- response_header['Last-Modified']
105
-
106
- #...
107
- end
108
-
109
- # req is a EventMachine::HttpClientOptions object
110
- # you can use it like this:
111
- # req.body
112
- # req.headers
113
- # req.uri
114
- # req.host
115
- # req.port
116
- def parse_response(file_name, custom_data, response_header, req)
117
- puts req.body
118
- puts req.headers
119
- puts req.uri
120
- puts req.host
121
- puts req.port
122
-
123
- #...
124
- end
125
-
126
- ```
127
-
128
- ## And there are many options you can use
129
-
130
- ```ruby
131
- TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
93
+ ## get_list/get_one参数
132
94
  ```
95
+ # down_list: 要请求的TaskStruct数组
96
+ # interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
97
+ # max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
133
98
 
134
- ```ruby
135
- #no concurrent limit (note: only use when list size is small)
136
- ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
137
-
138
- #sleep random time, often used in site which limit spider
139
- ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
140
-
141
- #set random time range
142
- ListSpider.get_list(down_list, interval: (1..10), max: 1)
143
-
99
+ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
100
+ get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
144
101
  ```
145
102
 
146
- ###Options below will take effect in the whole program (set them before call get_list)
103
+ ## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
147
104
 
148
105
  ```ruby
149
- #set proxy
150
- ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
151
-
152
- #set http header (if TaskStruct has header it will be used priority)
153
- ListSpider.set_header_option(header_option)
154
-
155
- #convert the file encoding to utf-8
156
- ListSpider.convert_to_utf8 = true
157
-
158
- #set connect timeout
159
- ListSpider.connect_timeout = 2*60
160
-
161
- #over write exist file
162
- ListSpider.overwrite_exist = false
163
-
164
- #set redirect depth
165
- ListSpider.max_redirects = 10
166
-
106
+ new(href, # 请求链接
107
+ local_path, # 保存数据的本地路径(此路径作为去重标准)
108
+ # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
109
+ http_method: :get,
110
+ custom_data: nil, # 自定义数据
111
+ parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
112
+ # 请求成功后的回调,此时可能没有保存文件,比如301,404
113
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
114
+ # http_req.response_header.status 状态码
115
+ # http_req.response_header 返回头
116
+ # http_req.response 返回体
117
+ callback: nil,
118
+ # 请求失败后的回调
119
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
120
+ errback: nil,
121
+ stream_callback: nil, # 流数据处理回调
122
+ convert_to_utf8: false, # 是否转换为utf8编码
123
+ overwrite_exist: false, # 是否覆盖现有文件
124
+ # 请求设置
125
+ redirects: 3, # 重定向次数
126
+ keepalive: nil, # (暂不支持复用)
127
+ file: nil, # 要上传的文件路径
128
+ path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
129
+ query: nil, # 查询字符串,可以是string或hash类型
130
+ body: nil, # 请求体,可以是string或hash类型
131
+ head: nil, # 请求头
132
+ # 连接设置
133
+ connect_timeout: 60, # 连接超时时间
134
+ inactivity_timeout: nil, # 连接后超时时间
135
+ # ssl设置
136
+ # ssl: {
137
+ # :private_key_file => '/tmp/server.key',
138
+ # :cert_chain_file => '/tmp/server.crt',
139
+ # :verify_peer => false
140
+ # }
141
+ ssl: nil,
142
+ # bind: {
143
+ # :host => '123.123.123.123', # use a specific interface for outbound request
144
+ # :port => '123'
145
+ # }
146
+ bind: nil,
147
+ # 代理设置
148
+ # proxy: {
149
+ # :host => '127.0.0.1', # proxy address
150
+ # :port => 9000, # proxy port
151
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
152
+
153
+ # :authorization => ['user', 'pass'] # proxy authorization header
154
+ # }
155
+ proxy: nil)
167
156
  ```
168
157
 
169
- ## There is a util class to help check or delete unvalid file
158
+ ## 回调函数形式
170
159
 
171
160
  ```ruby
172
- FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
173
-
174
- FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
175
-
176
- FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
161
+ # 文件成功保存后调用,通过parse_method参数传入
162
+ def parse_eresponse(task_struct)
163
+ # ...
164
+ end
177
165
 
178
- #params
179
- FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
166
+ # http请求成功后调用,通过callback参数传入
167
+ def call_back(task_struct, http_req)
168
+ # http_req 是EventMachine::HttpRequest对象
169
+ # http_req.response_header.status
170
+ # ...
171
+ end
180
172
 
181
- FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
173
+ # http请求出错后调用,通过errback参数传入
174
+ def err_back(task_struct, http_req)
175
+ # ...
176
+ end
182
177
  ```
183
178
 
184
- ### License
179
+ ## License
185
180
 
186
181
  (MIT License) - Copyright (c) 2016 Charles Zhang
@@ -4,10 +4,16 @@ require 'nokogiri'
4
4
  require 'fileutils'
5
5
  require 'set'
6
6
  require 'addressable/uri'
7
- require File.expand_path('../spider_helper', __FILE__)
8
- require File.expand_path('../file_filter', __FILE__)
7
+ require File.expand_path('spider_helper', __dir__)
8
+ require File.expand_path('file_filter', __dir__)
9
9
 
10
+ # 爬取任务类
10
11
  class TaskStruct
12
+ # * href 请求链接
13
+ # * local_path 保存数据的本地路径(此路径作为去重标准)
14
+ # * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
15
+ # * custom_data 自定义数据
16
+ # * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
11
17
  def initialize(href, # 请求链接
12
18
  local_path, # 保存数据的本地路径(此路径作为去重标准)
13
19
  # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
@@ -16,9 +22,9 @@ class TaskStruct
16
22
  parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
17
23
  # 请求成功后的回调,此时可能没有保存文件,比如301,404
18
24
  # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
19
- # http.response_header.status 状态码
20
- # http.response_header 返回头
21
- # http.response 返回体
25
+ # http_req.response_header.status 状态码
26
+ # http_req.response_header 返回头
27
+ # http_req.response 返回体
22
28
  callback: nil,
23
29
  # 请求失败后的回调
24
30
  # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
@@ -26,7 +32,7 @@ class TaskStruct
26
32
  stream_callback: nil, # 流数据处理回调
27
33
  convert_to_utf8: false, # 是否转换为utf8编码
28
34
  overwrite_exist: false, # 是否覆盖现有文件
29
- # request options
35
+ # 请求设置
30
36
  redirects: 3, # 重定向次数
31
37
  keepalive: nil, # (暂不支持复用)
32
38
  file: nil, # 要上传的文件路径
@@ -34,7 +40,7 @@ class TaskStruct
34
40
  query: nil, # 查询字符串,可以是string或hash类型
35
41
  body: nil, # 请求体,可以是string或hash类型
36
42
  head: nil, # 请求头
37
- # connection options
43
+ # 连接设置
38
44
  connect_timeout: 60, # 连接超时时间
39
45
  inactivity_timeout: nil, # 连接后超时时间
40
46
  # ssl设置
@@ -112,6 +118,41 @@ module ListSpider
112
118
  @local_path_set = Set.new
113
119
 
114
120
  class << self
121
+ def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
122
+ if interval.is_a? Range
123
+ @random_time_range = interval
124
+ interval = RANDOM_TIME
125
+ end
126
+
127
+ @down_list = filter_list(down_list)
128
+ @interval = interval
129
+ @max = max
130
+ @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
131
+ @succeed_size = 0
132
+ @failed_size = 0
133
+
134
+ puts "total size:#{@down_list.size}"
135
+ event_machine_start_list(next_task, method(:complete))
136
+ end
137
+
138
+ def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
139
+ get_list([task], interval: interval, max: max)
140
+ end
141
+
142
+ def add_task(task)
143
+ if task.is_a? Array
144
+ need_down_list = filter_list(task)
145
+ @down_list += need_down_list
146
+ elsif task.is_a?TaskStruct
147
+ need_down_list = filter_list([task])
148
+ @down_list += need_down_list
149
+ else
150
+ puts "error task type:#{task.class}"
151
+ end
152
+ end
153
+
154
+ private
155
+
115
156
  def event_machine_down(link_struct_list, callback = nil)
116
157
  failed_list = []
117
158
  succeed_list = []
@@ -247,43 +288,6 @@ module ListSpider
247
288
  end
248
289
  need_down_list
249
290
  end
250
-
251
- def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
252
- if interval.is_a? Range
253
- @random_time_range = interval
254
- interval = RANDOM_TIME
255
- end
256
-
257
- @down_list = []
258
-
259
- need_down_list = filter_list(down_list)
260
-
261
- @down_list += need_down_list
262
- @interval = interval
263
- @max = max
264
- @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
265
- @succeed_size = 0
266
- @failed_size = 0
267
-
268
- puts "total size:#{@down_list.size}"
269
- event_machine_start_list(next_task, method(:complete))
270
- end
271
-
272
- def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
273
- get_list([task], interval: interval, max: max)
274
- end
275
-
276
- def add_task(task)
277
- if task.is_a? Array
278
- need_down_list = filter_list(task)
279
- @down_list += need_down_list
280
- elsif task.is_a?TaskStruct
281
- need_down_list = filter_list([task])
282
- @down_list += need_down_list
283
- else
284
- puts "error task type:#{task.class}"
285
- end
286
- end
287
291
  end
288
292
 
289
293
  Signal.trap('INT') do
@@ -1,3 +1,3 @@
1
1
  module ListSpider
2
- VERSION = '2.0.2'.freeze
2
+ VERSION = '2.1.0'.freeze
3
3
  end
@@ -1,5 +1,5 @@
1
1
 
2
- lib = File.expand_path('../lib', __FILE__)
2
+ lib = File.expand_path('lib', __dir__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'list_spider/version'
5
5
 
@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency 'rake', '~> 10.0'
27
27
 
28
28
  spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
- spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
29
+ spec.add_dependency 'nokogiri', '>= 1.8.5'
30
30
  spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
31
  end
@@ -1,5 +1,5 @@
1
- # require 'list_spider'
2
- require File.expand_path('../lib/list_spider', __FILE__)
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
3
 
4
4
  DOWNLOAD_DIR = 'coolshell/'.freeze
5
5
 
@@ -16,8 +16,6 @@ def parse_index_item(e)
16
16
  end
17
17
  end
18
18
 
19
- # ListSpider.convert_to_utf8 = true
20
-
21
19
  # get_one is a simple function for one taskstruct situation
22
20
  ListSpider.get_one(
23
21
  TaskStruct.new(
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze
4
4
 
5
5
  @next_list = []
6
6
 
7
- def parse_index_item(file_name)
8
- content = File.read(file_name)
7
+ def parse_index_item(e)
8
+ content = File.read(e.local_path)
9
9
  doc = Nokogiri::HTML(content)
10
10
  list_group = doc.css('h2.entry-title')
11
11
  link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-02-28 00:00:00.000000000 Z
11
+ date: 2019-06-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -62,22 +62,16 @@ dependencies:
62
62
  name: nokogiri
63
63
  requirement: !ruby/object:Gem::Requirement
64
64
  requirements:
65
- - - "~>"
66
- - !ruby/object:Gem::Version
67
- version: '1.6'
68
65
  - - ">="
69
66
  - !ruby/object:Gem::Version
70
- version: 1.6.7
67
+ version: 1.8.5
71
68
  type: :runtime
72
69
  prerelease: false
73
70
  version_requirements: !ruby/object:Gem::Requirement
74
71
  requirements:
75
- - - "~>"
76
- - !ruby/object:Gem::Version
77
- version: '1.6'
78
72
  - - ">="
79
73
  - !ruby/object:Gem::Version
80
- version: 1.6.7
74
+ version: 1.8.5
81
75
  - !ruby/object:Gem::Dependency
82
76
  name: rchardet
83
77
  requirement: !ruby/object:Gem::Requirement
@@ -106,7 +100,9 @@ extensions: []
106
100
  extra_rdoc_files: []
107
101
  files:
108
102
  - ".gitignore"
103
+ - ".rdoc_options"
109
104
  - ".rubocop.yml"
105
+ - English_README.md
110
106
  - Gemfile
111
107
  - Gemfile.lock
112
108
  - README.md
@@ -140,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
140
136
  - !ruby/object:Gem::Version
141
137
  version: '0'
142
138
  requirements: []
143
- rubyforge_project:
144
- rubygems_version: 2.7.3
139
+ rubygems_version: 3.0.1
145
140
  signing_key:
146
141
  specification_version: 4
147
142
  summary: List Spider