list_spider 2.3.0 → 2.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d2fe3c4889a58bacda6fda400b4156c2adf4fbe774bdf9f15596958a1909d890
4
- data.tar.gz: db682834ba36559dfc4bbcb704151b4eddea315b614d1c0ef9aa9d170aa69f58
3
+ metadata.gz: e5cbae2e4b21b6976d8d6edd2a441914ebeb6db4bd6dabadb5ae1e0f9458698f
4
+ data.tar.gz: ebbf650a9eab92b50eeea5217cab6cd3ffa4b79036ebf4de552f11fe4435ee5d
5
5
  SHA512:
6
- metadata.gz: a259eb91eda3257d3be3df839274a4d40a3c11a4dc2fa0b3a57e580cef6aa180f06d27b05a2b2ee30ceb0e4bcbf037f55e36351d2034ae41f6636dcadaa14aa3
7
- data.tar.gz: 9d2062238acf39de5fd8c86d47eb7106640b31ebf87ff42de245a6e911a2efb3e54e7d4592ede162eb248cc33b59e00482781c023a6b1c739b65b703bc8a39e5
6
+ metadata.gz: 88569ec65a348f547d503c7b87a133beaf3066ae459755d4c34a7b3fcb2f73158c9993d95a595af592b451e34c3798f907deed7da2097153d3a021ad7f9088cb
7
+ data.tar.gz: 9fd5d151359f5c19102f6fd2a7679a9fdbdb88576b1b6fbe82950d6f43acff1b7fbd6ecf4eed9f67e6d7e80e793701b04d3469fa7649e6a12e5804ea24626ce4
data/.gitignore CHANGED
@@ -1,84 +1,84 @@
1
- *.gem
2
- *.rbc
3
- /.config
4
- /coverage/
5
- /InstalledFiles
6
- /pkg/
7
- /spec/reports/
8
- /spec/examples.txt
9
- /test/tmp/
10
- /test/version_tmp/
11
- /tmp/
12
-
13
- ## Specific to RubyMotion:
14
- .dat*
15
- .repl_history
16
- build/
17
-
18
- ## Documentation cache and generated files:
19
- /.yardoc/
20
- /_yardoc/
21
- /doc/
22
- /rdoc/
23
-
24
- ## Environment normalisation:
25
- /.bundle/
26
- /vendor/bundle
27
- /lib/bundler/man/
28
-
29
- # for a library or gem, you might want to ignore these files since the code is
30
- # intended to run in multiple environments; otherwise, check them in:
31
- # Gemfile.lock
32
- # .ruby-version
33
- # .ruby-gemset
34
-
35
- # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
36
- .rvmrc
37
-
38
- .DS_Store
39
- .AppleDouble
40
- .LSOverride
41
-
42
- # Icon must end with two \r
43
- Icon
44
-
45
-
46
- # Thumbnails
47
- ._*
48
-
49
- # Files that might appear in the root of a volume
50
- .DocumentRevisions-V100
51
- .fseventsd
52
- .Spotlight-V100
53
- .TemporaryItems
54
- .Trashes
55
- .VolumeIcon.icns
56
-
57
- # Directories potentially created on remote AFP share
58
- .AppleDB
59
- .AppleDesktop
60
- Network Trash Folder
61
- Temporary Items
62
- .apdisk
63
-
64
- # Windows image file caches
65
- Thumbs.db
66
- ehthumbs.db
67
-
68
- # Folder config file
69
- Desktop.ini
70
-
71
- # Recycle Bin used on file shares
72
- $RECYCLE.BIN/
73
-
74
- # Windows Installer files
75
- *.cab
76
- *.msi
77
- *.msm
78
- *.msp
79
-
80
- # Windows shortcuts
81
- *.lnk
82
-
83
- rubocopresult
84
- coolshell
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ ## Specific to RubyMotion:
14
+ .dat*
15
+ .repl_history
16
+ build/
17
+
18
+ ## Documentation cache and generated files:
19
+ /.yardoc/
20
+ /_yardoc/
21
+ /doc/
22
+ /rdoc/
23
+
24
+ ## Environment normalisation:
25
+ /.bundle/
26
+ /vendor/bundle
27
+ /lib/bundler/man/
28
+
29
+ # for a library or gem, you might want to ignore these files since the code is
30
+ # intended to run in multiple environments; otherwise, check them in:
31
+ # Gemfile.lock
32
+ # .ruby-version
33
+ # .ruby-gemset
34
+
35
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
36
+ .rvmrc
37
+
38
+ .DS_Store
39
+ .AppleDouble
40
+ .LSOverride
41
+
42
+ # Icon must end with two \r
43
+ Icon
44
+
45
+
46
+ # Thumbnails
47
+ ._*
48
+
49
+ # Files that might appear in the root of a volume
50
+ .DocumentRevisions-V100
51
+ .fseventsd
52
+ .Spotlight-V100
53
+ .TemporaryItems
54
+ .Trashes
55
+ .VolumeIcon.icns
56
+
57
+ # Directories potentially created on remote AFP share
58
+ .AppleDB
59
+ .AppleDesktop
60
+ Network Trash Folder
61
+ Temporary Items
62
+ .apdisk
63
+
64
+ # Windows image file caches
65
+ Thumbs.db
66
+ ehthumbs.db
67
+
68
+ # Folder config file
69
+ Desktop.ini
70
+
71
+ # Recycle Bin used on file shares
72
+ $RECYCLE.BIN/
73
+
74
+ # Windows Installer files
75
+ *.cab
76
+ *.msi
77
+ *.msm
78
+ *.msp
79
+
80
+ # Windows shortcuts
81
+ *.lnk
82
+
83
+ rubocopresult
84
+ coolshell
data/.rdoc_options CHANGED
@@ -1,23 +1,23 @@
1
- --- !ruby/object:RDoc::Options
2
- encoding: UTF-8
3
- static_path: []
4
- rdoc_include:
5
- - "."
6
- - "/Users/zhangchao/github/list_spider"
7
- charset: UTF-8
8
- exclude:
9
- hyperlink_all: false
10
- line_numbers: false
11
- locale:
12
- locale_dir: locale
13
- locale_name:
14
- main_page:
15
- markup: markdown
16
- output_decoration: true
17
- page_dir:
18
- show_hash: false
19
- tab_width: 8
20
- template_stylesheets: []
21
- title:
22
- visibility: :protected
23
- webcvs:
1
+ --- !ruby/object:RDoc::Options
2
+ encoding: UTF-8
3
+ static_path: []
4
+ rdoc_include:
5
+ - "."
6
+ - "/Users/zhangchao/github/list_spider"
7
+ charset: UTF-8
8
+ exclude:
9
+ hyperlink_all: false
10
+ line_numbers: false
11
+ locale:
12
+ locale_dir: locale
13
+ locale_name:
14
+ main_page:
15
+ markup: markdown
16
+ output_decoration: true
17
+ page_dir:
18
+ show_hash: false
19
+ tab_width: 8
20
+ template_stylesheets: []
21
+ title:
22
+ visibility: :protected
23
+ webcvs:
data/.rubocop.yml CHANGED
@@ -1,48 +1,48 @@
1
- Metrics/LineLength:
2
- Max: 120
3
- Metrics/MethodLength:
4
- Max: 50
5
- Metrics/ParameterLists:
6
- Max: 12
7
- Metrics/AbcSize:
8
- Max: 50
9
- Metrics/CyclomaticComplexity:
10
- Max: 10
11
- Metrics/PerceivedComplexity:
12
- Max: 10
13
- Style/GuardClause:
14
- MinBodyLength: 5
15
- Style/AsciiComments:
16
- Enabled: false
17
- Style/Documentation:
18
- Enabled: false
19
- Lint/AmbiguousRegexpLiteral:
20
- Enabled: false
21
- Layout/DefEndAlignment:
22
- AutoCorrect: true
23
- Layout/EndAlignment:
24
- AutoCorrect: true
25
- Style/BracesAroundHashParameters:
26
- Enabled: false
27
- Style/ClassAndModuleChildren:
28
- Enabled: false
29
- Style/AutoResourceCleanup:
30
- Enabled: true
31
- Style/CollectionMethods:
32
- Enabled: true
33
- Style/Encoding:
34
- Enabled: true
35
- Style/MethodCalledOnDoEndBlock:
36
- Enabled: true
37
- Layout/MultilineAssignmentLayout:
38
- Enabled: true
39
- Style/OptionHash:
40
- Enabled: true
41
- Style/StringMethods:
42
- Enabled: true
43
- Style/SymbolArray:
44
- Enabled: true
45
- Style/NonNilCheck:
46
- IncludeSemanticChanges: true
47
- Style/Send:
48
- Enabled: true
1
+ Metrics/LineLength:
2
+ Max: 120
3
+ Metrics/MethodLength:
4
+ Max: 50
5
+ Metrics/ParameterLists:
6
+ Max: 12
7
+ Metrics/AbcSize:
8
+ Max: 50
9
+ Metrics/CyclomaticComplexity:
10
+ Max: 10
11
+ Metrics/PerceivedComplexity:
12
+ Max: 10
13
+ Style/GuardClause:
14
+ MinBodyLength: 5
15
+ Style/AsciiComments:
16
+ Enabled: false
17
+ Style/Documentation:
18
+ Enabled: false
19
+ Lint/AmbiguousRegexpLiteral:
20
+ Enabled: false
21
+ Layout/DefEndAlignment:
22
+ AutoCorrect: true
23
+ Layout/EndAlignment:
24
+ AutoCorrect: true
25
+ Style/BracesAroundHashParameters:
26
+ Enabled: false
27
+ Style/ClassAndModuleChildren:
28
+ Enabled: false
29
+ Style/AutoResourceCleanup:
30
+ Enabled: true
31
+ Style/CollectionMethods:
32
+ Enabled: true
33
+ Style/Encoding:
34
+ Enabled: true
35
+ Style/MethodCalledOnDoEndBlock:
36
+ Enabled: true
37
+ Layout/MultilineAssignmentLayout:
38
+ Enabled: true
39
+ Style/OptionHash:
40
+ Enabled: true
41
+ Style/StringMethods:
42
+ Enabled: true
43
+ Style/SymbolArray:
44
+ Enabled: true
45
+ Style/NonNilCheck:
46
+ IncludeSemanticChanges: true
47
+ Style/Send:
48
+ Enabled: true
data/English_README.md CHANGED
@@ -1,169 +1,169 @@
1
- # list_spider
2
-
3
- A url list spider based on em-http-request.
4
-
5
- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
6
-
7
- ## Features
8
- * Duplicate url filtering (based on local path, so you can custom your behavior).
9
-
10
- * Convert to UTF-8 support.
11
-
12
- * Increased spider support (don't spider exist).
13
-
14
- * Customize concurrent number and interval between task.
15
-
16
- * Http options support.
17
-
18
- ## Getting started
19
-
20
- ```ruby
21
- gem install list_spider
22
- ```
23
-
24
- Or add it to your Gemfile
25
-
26
- ```ruby
27
- gem 'list_spider'
28
- ```
29
-
30
- ## Use like this
31
- ```ruby
32
- require 'list_spider'
33
-
34
- DOWNLOAD_DIR = 'coolshell/'.freeze
35
-
36
- @next_list = []
37
-
38
- def parse_index_item(e)
39
- content = File.read(e.local_path)
40
- doc = Nokogiri::HTML(content)
41
- list_group = doc.css('h2.entry-title')
42
- link_list = list_group.css('a')
43
-
44
- link_list.each do |link|
45
- href = link['href']
46
- local_path = DOWNLOAD_DIR + link.content + '.html'
47
- # or you can save them to database for later use
48
- @next_list << TaskStruct.new(href, local_path)
49
- end
50
- end
51
-
52
- task_list = []
53
- task_list << TaskStruct.new(
54
- 'https://coolshell.cn/',
55
- DOWNLOAD_DIR + 'index.html',
56
- parse_method: method(:parse_index_item)
57
- )
58
-
59
- ListSpider.get_list(task_list)
60
- ListSpider.get_list(@next_list, max: 60)
61
- ```
62
-
63
- ## Or in one step
64
- ```ruby
65
- require 'list_spider'
66
-
67
- DOWNLOAD_DIR = 'coolshell/'.freeze
68
-
69
- def parse_index_item(e)
70
- content = File.read(e.local_path)
71
- doc = Nokogiri::HTML(content)
72
- list_group = doc.css('h2.entry-title')
73
- link_list = list_group.css('a')
74
-
75
- link_list.each do |link|
76
- href = link['href']
77
- local_path = DOWNLOAD_DIR + link.content + '.html'
78
- ListSpider.add_task(TaskStruct.new(href, local_path))
79
- end
80
- end
81
-
82
- # get_one is a simple function for one taskstruct situation
83
- ListSpider.get_one(
84
- TaskStruct.new(
85
- 'https://coolshell.cn/',
86
- DOWNLOAD_DIR + 'index.html',
87
- parse_method: method(:parse_index_item)
88
- ),
89
- max: 60
90
- )
91
- ```
92
-
93
- ## And there are many options you can use
94
-
95
- ```ruby
96
- def initialize(href, # 请求链接
97
- local_path, # 保存数据的本地路径(此路径作为去重标准)
98
- # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
99
- http_method: :get,
100
- custom_data: nil, # 自定义数据
101
- parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
102
- # 请求成功后的回调,此时可能没有保存文件,比如301,404
103
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
104
- # http.response_header.status 状态码
105
- # http.response_header 返回头
106
- # http.response 返回体
107
- callback: nil,
108
- # 请求失败后的回调
109
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
110
- errback: nil,
111
- stream_callback: nil, # 流数据处理回调
112
- convert_to_utf8: false, # 是否转换为utf8编码
113
- overwrite_exist: false, # 是否覆盖现有文件
114
- # request options
115
- redirects: 3, # 重定向次数
116
- keepalive: nil, # (暂不支持复用)
117
- file: nil, # 要上传的文件路径
118
- path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
119
- query: nil, # 查询字符串,可以是string或hash类型
120
- body: nil, # 请求体,可以是string或hash类型
121
- head: nil, # 请求头
122
- # connection options
123
- connect_timeout: 60, # 连接超时时间
124
- inactivity_timeout: nil, # 连接后超时时间
125
- # ssl设置
126
- # ssl: {
127
- # :private_key_file => '/tmp/server.key',
128
- # :cert_chain_file => '/tmp/server.crt',
129
- # :verify_peer => false
130
- # }
131
- ssl: nil,
132
- # bind: {
133
- # :host => '123.123.123.123', # use a specific interface for outbound request
134
- # :port => '123'
135
- # }
136
- bind: nil,
137
- # 代理设置
138
- # proxy: {
139
- # :host => '127.0.0.1', # proxy address
140
- # :port => 9000, # proxy port
141
- # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
142
-
143
- # :authorization => ['user', 'pass'] # proxy authorization header
144
- # }
145
- proxy: nil)
146
- ```
147
-
148
- ## Callback methods form
149
-
150
- ```ruby
151
- # called when the file is saved successfully
152
- def parse_eresponse(task_struct)
153
- # ...
154
- end
155
-
156
- def call_back(task_struct, http_req)
157
- # http_req is a EventMachine::HttpRequest object
158
- # http_req.response_header.status
159
- # ...
160
- end
161
-
162
- def err_back(task_struct, http_req)
163
- # ...
164
- end
165
- ```
166
-
167
- ### License
168
-
169
- (MIT License) - Copyright (c) 2016 Charles Zhang
1
+ # list_spider
2
+
3
+ A url list spider based on em-http-request.
4
+
5
+ Often we only need to crawl a list of URLs, parse the results, and then crawl the newly discovered URLs. This gem is built for exactly that purpose.
6
+
7
+ ## Features
8
+ * Duplicate URL filtering (based on the local path, so you can customize the behavior).
9
+
10
+ * Convert to UTF-8 support.
11
+
12
+ * Incremental crawling support (files that already exist are not fetched again).
13
+
14
+ * Customizable concurrency limit and interval between tasks.
15
+
16
+ * HTTP options support.
17
+
18
+ ## Getting started
19
+
20
+ ```ruby
21
+ gem install list_spider
22
+ ```
23
+
24
+ Or add it to your Gemfile
25
+
26
+ ```ruby
27
+ gem 'list_spider'
28
+ ```
29
+
30
+ ## Use like this
31
+ ```ruby
32
+ require 'list_spider'
33
+
34
+ DOWNLOAD_DIR = 'coolshell/'.freeze
35
+
36
+ @next_list = []
37
+
38
+ def parse_index_item(e)
39
+ content = File.read(e.local_path)
40
+ doc = Nokogiri::HTML(content)
41
+ list_group = doc.css('h2.entry-title')
42
+ link_list = list_group.css('a')
43
+
44
+ link_list.each do |link|
45
+ href = link['href']
46
+ local_path = DOWNLOAD_DIR + link.content + '.html'
47
+ # or you can save them to database for later use
48
+ @next_list << TaskStruct.new(href, local_path)
49
+ end
50
+ end
51
+
52
+ task_list = []
53
+ task_list << TaskStruct.new(
54
+ 'https://coolshell.cn/',
55
+ DOWNLOAD_DIR + 'index.html',
56
+ parse_method: method(:parse_index_item)
57
+ )
58
+
59
+ ListSpider.get_list(task_list)
60
+ ListSpider.get_list(@next_list, max: 60)
61
+ ```
62
+
63
+ ## Or in one step
64
+ ```ruby
65
+ require 'list_spider'
66
+
67
+ DOWNLOAD_DIR = 'coolshell/'.freeze
68
+
69
+ def parse_index_item(e)
70
+ content = File.read(e.local_path)
71
+ doc = Nokogiri::HTML(content)
72
+ list_group = doc.css('h2.entry-title')
73
+ link_list = list_group.css('a')
74
+
75
+ link_list.each do |link|
76
+ href = link['href']
77
+ local_path = DOWNLOAD_DIR + link.content + '.html'
78
+ ListSpider.add_task(TaskStruct.new(href, local_path))
79
+ end
80
+ end
81
+
82
+ # get_one is a convenience method for the single-TaskStruct case
83
+ ListSpider.get_one(
84
+ TaskStruct.new(
85
+ 'https://coolshell.cn/',
86
+ DOWNLOAD_DIR + 'index.html',
87
+ parse_method: method(:parse_index_item)
88
+ ),
89
+ max: 60
90
+ )
91
+ ```
92
+
93
+ ## And there are many options you can use
94
+
95
+ ```ruby
96
+ def initialize(href, # 请求链接
97
+ local_path, # 保存数据的本地路径(此路径作为去重标准)
98
+ # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
99
+ http_method: :get,
100
+ custom_data: nil, # 自定义数据
101
+ parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
102
+ # 请求成功后的回调,此时可能没有保存文件,比如301,404
103
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
104
+ # http.response_header.status 状态码
105
+ # http.response_header 返回头
106
+ # http.response 返回体
107
+ callback: nil,
108
+ # 请求失败后的回调
109
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
110
+ errback: nil,
111
+ stream_callback: nil, # 流数据处理回调
112
+ convert_to_utf8: false, # 是否转换为utf8编码
113
+ overwrite_exist: false, # 是否覆盖现有文件
114
+ # request options
115
+ redirects: 3, # 重定向次数
116
+ keepalive: nil, # (暂不支持复用)
117
+ file: nil, # 要上传的文件路径
118
+ path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
119
+ query: nil, # 查询字符串,可以是string或hash类型
120
+ body: nil, # 请求体,可以是string或hash类型
121
+ head: nil, # 请求头
122
+ # connection options
123
+ connect_timeout: 60, # 连接超时时间
124
+ inactivity_timeout: nil, # 连接后超时时间
125
+ # ssl设置
126
+ # ssl: {
127
+ # :private_key_file => '/tmp/server.key',
128
+ # :cert_chain_file => '/tmp/server.crt',
129
+ # :verify_peer => false
130
+ # }
131
+ ssl: nil,
132
+ # bind: {
133
+ # :host => '123.123.123.123', # use a specific interface for outbound request
134
+ # :port => '123'
135
+ # }
136
+ bind: nil,
137
+ # 代理设置
138
+ # proxy: {
139
+ # :host => '127.0.0.1', # proxy address
140
+ # :port => 9000, # proxy port
141
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
142
+
143
+ # :authorization => ['user', 'pass'] # proxy authorization header
144
+ # }
145
+ proxy: nil)
146
+ ```
147
+
148
+ ## Callback method signatures
149
+
150
+ ```ruby
151
+ # called when the file is saved successfully
152
+ def parse_eresponse(task_struct)
153
+ # ...
154
+ end
155
+
156
+ def call_back(task_struct, http_req)
157
+ # http_req is an EventMachine::HttpRequest object
158
+ # http_req.response_header.status
159
+ # ...
160
+ end
161
+
162
+ def err_back(task_struct, http_req)
163
+ # ...
164
+ end
165
+ ```
166
+
167
+ ### License
168
+
169
+ (MIT License) - Copyright (c) 2016 Charles Zhang