list_spider 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d2fe3c4889a58bacda6fda400b4156c2adf4fbe774bdf9f15596958a1909d890
4
- data.tar.gz: db682834ba36559dfc4bbcb704151b4eddea315b614d1c0ef9aa9d170aa69f58
3
+ metadata.gz: e5cbae2e4b21b6976d8d6edd2a441914ebeb6db4bd6dabadb5ae1e0f9458698f
4
+ data.tar.gz: ebbf650a9eab92b50eeea5217cab6cd3ffa4b79036ebf4de552f11fe4435ee5d
5
5
  SHA512:
6
- metadata.gz: a259eb91eda3257d3be3df839274a4d40a3c11a4dc2fa0b3a57e580cef6aa180f06d27b05a2b2ee30ceb0e4bcbf037f55e36351d2034ae41f6636dcadaa14aa3
7
- data.tar.gz: 9d2062238acf39de5fd8c86d47eb7106640b31ebf87ff42de245a6e911a2efb3e54e7d4592ede162eb248cc33b59e00482781c023a6b1c739b65b703bc8a39e5
6
+ metadata.gz: 88569ec65a348f547d503c7b87a133beaf3066ae459755d4c34a7b3fcb2f73158c9993d95a595af592b451e34c3798f907deed7da2097153d3a021ad7f9088cb
7
+ data.tar.gz: 9fd5d151359f5c19102f6fd2a7679a9fdbdb88576b1b6fbe82950d6f43acff1b7fbd6ecf4eed9f67e6d7e80e793701b04d3469fa7649e6a12e5804ea24626ce4
data/.gitignore CHANGED
@@ -1,84 +1,84 @@
1
- *.gem
2
- *.rbc
3
- /.config
4
- /coverage/
5
- /InstalledFiles
6
- /pkg/
7
- /spec/reports/
8
- /spec/examples.txt
9
- /test/tmp/
10
- /test/version_tmp/
11
- /tmp/
12
-
13
- ## Specific to RubyMotion:
14
- .dat*
15
- .repl_history
16
- build/
17
-
18
- ## Documentation cache and generated files:
19
- /.yardoc/
20
- /_yardoc/
21
- /doc/
22
- /rdoc/
23
-
24
- ## Environment normalisation:
25
- /.bundle/
26
- /vendor/bundle
27
- /lib/bundler/man/
28
-
29
- # for a library or gem, you might want to ignore these files since the code is
30
- # intended to run in multiple environments; otherwise, check them in:
31
- # Gemfile.lock
32
- # .ruby-version
33
- # .ruby-gemset
34
-
35
- # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
36
- .rvmrc
37
-
38
- .DS_Store
39
- .AppleDouble
40
- .LSOverride
41
-
42
- # Icon must end with two \r
43
- Icon
44
-
45
-
46
- # Thumbnails
47
- ._*
48
-
49
- # Files that might appear in the root of a volume
50
- .DocumentRevisions-V100
51
- .fseventsd
52
- .Spotlight-V100
53
- .TemporaryItems
54
- .Trashes
55
- .VolumeIcon.icns
56
-
57
- # Directories potentially created on remote AFP share
58
- .AppleDB
59
- .AppleDesktop
60
- Network Trash Folder
61
- Temporary Items
62
- .apdisk
63
-
64
- # Windows image file caches
65
- Thumbs.db
66
- ehthumbs.db
67
-
68
- # Folder config file
69
- Desktop.ini
70
-
71
- # Recycle Bin used on file shares
72
- $RECYCLE.BIN/
73
-
74
- # Windows Installer files
75
- *.cab
76
- *.msi
77
- *.msm
78
- *.msp
79
-
80
- # Windows shortcuts
81
- *.lnk
82
-
83
- rubocopresult
84
- coolshell
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ ## Specific to RubyMotion:
14
+ .dat*
15
+ .repl_history
16
+ build/
17
+
18
+ ## Documentation cache and generated files:
19
+ /.yardoc/
20
+ /_yardoc/
21
+ /doc/
22
+ /rdoc/
23
+
24
+ ## Environment normalisation:
25
+ /.bundle/
26
+ /vendor/bundle
27
+ /lib/bundler/man/
28
+
29
+ # for a library or gem, you might want to ignore these files since the code is
30
+ # intended to run in multiple environments; otherwise, check them in:
31
+ # Gemfile.lock
32
+ # .ruby-version
33
+ # .ruby-gemset
34
+
35
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
36
+ .rvmrc
37
+
38
+ .DS_Store
39
+ .AppleDouble
40
+ .LSOverride
41
+
42
+ # Icon must end with two \r
43
+ Icon
44
+
45
+
46
+ # Thumbnails
47
+ ._*
48
+
49
+ # Files that might appear in the root of a volume
50
+ .DocumentRevisions-V100
51
+ .fseventsd
52
+ .Spotlight-V100
53
+ .TemporaryItems
54
+ .Trashes
55
+ .VolumeIcon.icns
56
+
57
+ # Directories potentially created on remote AFP share
58
+ .AppleDB
59
+ .AppleDesktop
60
+ Network Trash Folder
61
+ Temporary Items
62
+ .apdisk
63
+
64
+ # Windows image file caches
65
+ Thumbs.db
66
+ ehthumbs.db
67
+
68
+ # Folder config file
69
+ Desktop.ini
70
+
71
+ # Recycle Bin used on file shares
72
+ $RECYCLE.BIN/
73
+
74
+ # Windows Installer files
75
+ *.cab
76
+ *.msi
77
+ *.msm
78
+ *.msp
79
+
80
+ # Windows shortcuts
81
+ *.lnk
82
+
83
+ rubocopresult
84
+ coolshell
data/.rdoc_options CHANGED
@@ -1,23 +1,23 @@
1
- --- !ruby/object:RDoc::Options
2
- encoding: UTF-8
3
- static_path: []
4
- rdoc_include:
5
- - "."
6
- - "/Users/zhangchao/github/list_spider"
7
- charset: UTF-8
8
- exclude:
9
- hyperlink_all: false
10
- line_numbers: false
11
- locale:
12
- locale_dir: locale
13
- locale_name:
14
- main_page:
15
- markup: markdown
16
- output_decoration: true
17
- page_dir:
18
- show_hash: false
19
- tab_width: 8
20
- template_stylesheets: []
21
- title:
22
- visibility: :protected
23
- webcvs:
1
+ --- !ruby/object:RDoc::Options
2
+ encoding: UTF-8
3
+ static_path: []
4
+ rdoc_include:
5
+ - "."
6
+ - "/Users/zhangchao/github/list_spider"
7
+ charset: UTF-8
8
+ exclude:
9
+ hyperlink_all: false
10
+ line_numbers: false
11
+ locale:
12
+ locale_dir: locale
13
+ locale_name:
14
+ main_page:
15
+ markup: markdown
16
+ output_decoration: true
17
+ page_dir:
18
+ show_hash: false
19
+ tab_width: 8
20
+ template_stylesheets: []
21
+ title:
22
+ visibility: :protected
23
+ webcvs:
data/.rubocop.yml CHANGED
@@ -1,48 +1,48 @@
1
- Metrics/LineLength:
2
- Max: 120
3
- Metrics/MethodLength:
4
- Max: 50
5
- Metrics/ParameterLists:
6
- Max: 12
7
- Metrics/AbcSize:
8
- Max: 50
9
- Metrics/CyclomaticComplexity:
10
- Max: 10
11
- Metrics/PerceivedComplexity:
12
- Max: 10
13
- Style/GuardClause:
14
- MinBodyLength: 5
15
- Style/AsciiComments:
16
- Enabled: false
17
- Style/Documentation:
18
- Enabled: false
19
- Lint/AmbiguousRegexpLiteral:
20
- Enabled: false
21
- Layout/DefEndAlignment:
22
- AutoCorrect: true
23
- Layout/EndAlignment:
24
- AutoCorrect: true
25
- Style/BracesAroundHashParameters:
26
- Enabled: false
27
- Style/ClassAndModuleChildren:
28
- Enabled: false
29
- Style/AutoResourceCleanup:
30
- Enabled: true
31
- Style/CollectionMethods:
32
- Enabled: true
33
- Style/Encoding:
34
- Enabled: true
35
- Style/MethodCalledOnDoEndBlock:
36
- Enabled: true
37
- Layout/MultilineAssignmentLayout:
38
- Enabled: true
39
- Style/OptionHash:
40
- Enabled: true
41
- Style/StringMethods:
42
- Enabled: true
43
- Style/SymbolArray:
44
- Enabled: true
45
- Style/NonNilCheck:
46
- IncludeSemanticChanges: true
47
- Style/Send:
48
- Enabled: true
1
+ Metrics/LineLength:
2
+ Max: 120
3
+ Metrics/MethodLength:
4
+ Max: 50
5
+ Metrics/ParameterLists:
6
+ Max: 12
7
+ Metrics/AbcSize:
8
+ Max: 50
9
+ Metrics/CyclomaticComplexity:
10
+ Max: 10
11
+ Metrics/PerceivedComplexity:
12
+ Max: 10
13
+ Style/GuardClause:
14
+ MinBodyLength: 5
15
+ Style/AsciiComments:
16
+ Enabled: false
17
+ Style/Documentation:
18
+ Enabled: false
19
+ Lint/AmbiguousRegexpLiteral:
20
+ Enabled: false
21
+ Layout/DefEndAlignment:
22
+ AutoCorrect: true
23
+ Layout/EndAlignment:
24
+ AutoCorrect: true
25
+ Style/BracesAroundHashParameters:
26
+ Enabled: false
27
+ Style/ClassAndModuleChildren:
28
+ Enabled: false
29
+ Style/AutoResourceCleanup:
30
+ Enabled: true
31
+ Style/CollectionMethods:
32
+ Enabled: true
33
+ Style/Encoding:
34
+ Enabled: true
35
+ Style/MethodCalledOnDoEndBlock:
36
+ Enabled: true
37
+ Layout/MultilineAssignmentLayout:
38
+ Enabled: true
39
+ Style/OptionHash:
40
+ Enabled: true
41
+ Style/StringMethods:
42
+ Enabled: true
43
+ Style/SymbolArray:
44
+ Enabled: true
45
+ Style/NonNilCheck:
46
+ IncludeSemanticChanges: true
47
+ Style/Send:
48
+ Enabled: true
data/English_README.md CHANGED
@@ -1,169 +1,169 @@
1
- # list_spider
2
-
3
- A url list spider based on em-http-request.
4
-
5
- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
6
-
7
- ## Features
8
- * Duplicate url filtering (based on local path, so you can custom your behavior).
9
-
10
- * Convert to UTF-8 support.
11
-
12
- * Increased spider support (don't spider exist).
13
-
14
- * Customize concurrent number and interval between task.
15
-
16
- * Http options support.
17
-
18
- ## Getting started
19
-
20
- ```ruby
21
- gem install list_spider
22
- ```
23
-
24
- Or add it to your Gemfile
25
-
26
- ```ruby
27
- gem 'list_spider'
28
- ```
29
-
30
- ## Use like this
31
- ```ruby
32
- require 'list_spider'
33
-
34
- DOWNLOAD_DIR = 'coolshell/'.freeze
35
-
36
- @next_list = []
37
-
38
- def parse_index_item(e)
39
- content = File.read(e.local_path)
40
- doc = Nokogiri::HTML(content)
41
- list_group = doc.css('h2.entry-title')
42
- link_list = list_group.css('a')
43
-
44
- link_list.each do |link|
45
- href = link['href']
46
- local_path = DOWNLOAD_DIR + link.content + '.html'
47
- # or you can save them to database for later use
48
- @next_list << TaskStruct.new(href, local_path)
49
- end
50
- end
51
-
52
- task_list = []
53
- task_list << TaskStruct.new(
54
- 'https://coolshell.cn/',
55
- DOWNLOAD_DIR + 'index.html',
56
- parse_method: method(:parse_index_item)
57
- )
58
-
59
- ListSpider.get_list(task_list)
60
- ListSpider.get_list(@next_list, max: 60)
61
- ```
62
-
63
- ## Or in one step
64
- ```ruby
65
- require 'list_spider'
66
-
67
- DOWNLOAD_DIR = 'coolshell/'.freeze
68
-
69
- def parse_index_item(e)
70
- content = File.read(e.local_path)
71
- doc = Nokogiri::HTML(content)
72
- list_group = doc.css('h2.entry-title')
73
- link_list = list_group.css('a')
74
-
75
- link_list.each do |link|
76
- href = link['href']
77
- local_path = DOWNLOAD_DIR + link.content + '.html'
78
- ListSpider.add_task(TaskStruct.new(href, local_path))
79
- end
80
- end
81
-
82
- # get_one is a simple function for one taskstruct situation
83
- ListSpider.get_one(
84
- TaskStruct.new(
85
- 'https://coolshell.cn/',
86
- DOWNLOAD_DIR + 'index.html',
87
- parse_method: method(:parse_index_item)
88
- ),
89
- max: 60
90
- )
91
- ```
92
-
93
- ## And there are many options you can use
94
-
95
- ```ruby
96
- def initialize(href, # 请求链接
97
- local_path, # 保存数据的本地路径(此路径作为去重标准)
98
- # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
99
- http_method: :get,
100
- custom_data: nil, # 自定义数据
101
- parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
102
- # 请求成功后的回调,此时可能没有保存文件,比如301,404
103
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
104
- # http.response_header.status 状态码
105
- # http.response_header 返回头
106
- # http.response 返回体
107
- callback: nil,
108
- # 请求失败后的回调
109
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
110
- errback: nil,
111
- stream_callback: nil, # 流数据处理回调
112
- convert_to_utf8: false, # 是否转换为utf8编码
113
- overwrite_exist: false, # 是否覆盖现有文件
114
- # request options
115
- redirects: 3, # 重定向次数
116
- keepalive: nil, # (暂不支持复用)
117
- file: nil, # 要上传的文件路径
118
- path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
119
- query: nil, # 查询字符串,可以是string或hash类型
120
- body: nil, # 请求体,可以是string或hash类型
121
- head: nil, # 请求头
122
- # connection options
123
- connect_timeout: 60, # 连接超时时间
124
- inactivity_timeout: nil, # 连接后超时时间
125
- # ssl设置
126
- # ssl: {
127
- # :private_key_file => '/tmp/server.key',
128
- # :cert_chain_file => '/tmp/server.crt',
129
- # :verify_peer => false
130
- # }
131
- ssl: nil,
132
- # bind: {
133
- # :host => '123.123.123.123', # use a specific interface for outbound request
134
- # :port => '123'
135
- # }
136
- bind: nil,
137
- # 代理设置
138
- # proxy: {
139
- # :host => '127.0.0.1', # proxy address
140
- # :port => 9000, # proxy port
141
- # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
142
-
143
- # :authorization => ['user', 'pass'] # proxy authorization header
144
- # }
145
- proxy: nil)
146
- ```
147
-
148
- ## Callback methods form
149
-
150
- ```ruby
151
- # called when the file is saved successfully
152
- def parse_eresponse(task_struct)
153
- # ...
154
- end
155
-
156
- def call_back(task_struct, http_req)
157
- # http_req is a EventMachine::HttpRequest object
158
- # http_req.response_header.status
159
- # ...
160
- end
161
-
162
- def err_back(task_struct, http_req)
163
- # ...
164
- end
165
- ```
166
-
167
- ### License
168
-
169
- (MIT License) - Copyright (c) 2016 Charles Zhang
1
+ # list_spider
2
+
3
+ A URL list spider based on em-http-request.
4
+
5
+ Often we only need to crawl a list of URLs, parse the results, and then crawl again. This gem is built for exactly that purpose.
6
+
7
+ ## Features
8
+ * Duplicate URL filtering (based on the local path, so you can customize the behavior).
9
+
10
+ * Convert to UTF-8 support.
11
+
12
+ * Incremental crawling support (files that already exist are not downloaded again).
13
+
14
+ * Customizable concurrency and interval between tasks.
15
+
16
+ * HTTP options support.
17
+
18
+ ## Getting started
19
+
20
+ ```bash
21
+ gem install list_spider
22
+ ```
23
+
24
+ Or add it to your Gemfile
25
+
26
+ ```ruby
27
+ gem 'list_spider'
28
+ ```
29
+
30
+ ## Use like this
31
+ ```ruby
32
+ require 'list_spider'
33
+
34
+ DOWNLOAD_DIR = 'coolshell/'.freeze
35
+
36
+ @next_list = []
37
+
38
+ def parse_index_item(e)
39
+ content = File.read(e.local_path)
40
+ doc = Nokogiri::HTML(content)
41
+ list_group = doc.css('h2.entry-title')
42
+ link_list = list_group.css('a')
43
+
44
+ link_list.each do |link|
45
+ href = link['href']
46
+ local_path = DOWNLOAD_DIR + link.content + '.html'
47
+ # or you can save them to database for later use
48
+ @next_list << TaskStruct.new(href, local_path)
49
+ end
50
+ end
51
+
52
+ task_list = []
53
+ task_list << TaskStruct.new(
54
+ 'https://coolshell.cn/',
55
+ DOWNLOAD_DIR + 'index.html',
56
+ parse_method: method(:parse_index_item)
57
+ )
58
+
59
+ ListSpider.get_list(task_list)
60
+ ListSpider.get_list(@next_list, max: 60)
61
+ ```
62
+
63
+ ## Or in one step
64
+ ```ruby
65
+ require 'list_spider'
66
+
67
+ DOWNLOAD_DIR = 'coolshell/'.freeze
68
+
69
+ def parse_index_item(e)
70
+ content = File.read(e.local_path)
71
+ doc = Nokogiri::HTML(content)
72
+ list_group = doc.css('h2.entry-title')
73
+ link_list = list_group.css('a')
74
+
75
+ link_list.each do |link|
76
+ href = link['href']
77
+ local_path = DOWNLOAD_DIR + link.content + '.html'
78
+ ListSpider.add_task(TaskStruct.new(href, local_path))
79
+ end
80
+ end
81
+
82
+ # get_one is a convenience method for the case where there is a single TaskStruct
83
+ ListSpider.get_one(
84
+ TaskStruct.new(
85
+ 'https://coolshell.cn/',
86
+ DOWNLOAD_DIR + 'index.html',
87
+ parse_method: method(:parse_index_item)
88
+ ),
89
+ max: 60
90
+ )
91
+ ```
92
+
93
+ ## And there are many options you can use
94
+
95
+ ```ruby
96
+ def initialize(href, # request URL
97
+ local_path, # local path where the data is saved (also used as the de-duplication key)
98
+ # HTTP method, one of: :get, :head, :delete, :put, :post, :patch, :options
99
+ http_method: :get,
100
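+ custom_data: nil, # custom user data
101
+ parse_method: nil, # callback for parsing the saved file; receives the TaskStruct object itself
102
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
103
+ # receives the TaskStruct object itself and the corresponding EventMachine::HttpRequest object
104
+ # http.response_header.status status code
105
+ # http.response_header response headers
106
+ # http.response response body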
107
+ callback: nil,
108
+ # callback invoked after a failed request
109
+ # receives the TaskStruct object itself and the corresponding EventMachine::HttpRequest object
110
+ errback: nil,
111
+ stream_callback: nil, # callback for processing streamed data
112
+ convert_to_utf8: false, # whether to convert the content to UTF-8
113
+ overwrite_exist: false, # whether to overwrite existing files
114
+ # request options
115
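+ redirects: 3, # number of redirects to follow
116
+ keepalive: nil, # (connection reuse is not supported yet)
117
+ file: nil, # path of the file to upload
118
+ path: nil, # request path, useful for pipelined requests (not supported yet)
119
+ query: nil, # query string; may be a String or a Hash
120
+ body: nil, # request body; may be a String or a Hash
121
+ head: nil, # request headers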
122
+ # connection options
123
+ connect_timeout: 60, # connection timeout
124
+ inactivity_timeout: nil, # inactivity timeout after the connection is established
125
+ # SSL settings
126
+ # ssl: {
127
+ # :private_key_file => '/tmp/server.key',
128
+ # :cert_chain_file => '/tmp/server.crt',
129
+ # :verify_peer => false
130
+ # }
131
+ ssl: nil,
132
+ # bind: {
133
+ # :host => '123.123.123.123', # use a specific interface for outbound request
134
+ # :port => '123'
135
+ # }
136
+ bind: nil,
137
+ # proxy settings
138
+ # proxy: {
139
+ # :host => '127.0.0.1', # proxy address
140
+ # :port => 9000, # proxy port
141
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
142
+
143
+ # :authorization => ['user', 'pass'] # proxy authorization header
144
+ # }
145
+ proxy: nil)
146
+ ```
147
+
148
+ ## Callback method signatures
149
+
150
+ ```ruby
151
+ # called when the file is saved successfully
152
+ def parse_eresponse(task_struct)
153
+ # ...
154
+ end
155
+
156
+ def call_back(task_struct, http_req)
157
+ # http_req is an EventMachine::HttpRequest object
158
+ # http_req.response_header.status
159
+ # ...
160
+ end
161
+
162
+ def err_back(task_struct, http_req)
163
+ # ...
164
+ end
165
+ ```
166
+
167
+ ### License
168
+
169
+ (MIT License) - Copyright (c) 2016 Charles Zhang