list_spider 2.3.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
- module ListSpider
2
- VERSION = '2.3.0'.freeze
3
- end
1
+ module ListSpider
2
+ VERSION = '2.8.0'.freeze
3
+ end
@@ -1,110 +1,110 @@
1
- require 'rchardet'
2
- require 'net/http'
3
-
4
- module SpiderHelper
5
- class << self
6
- def direct_http_get(href, local_path, params: nil,
7
- header: nil, convert_to_utf8: false)
8
- href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
9
-
10
- begin
11
- href.query = URI.encode_www_form(params) if params
12
- req = Net::HTTP::Get.new(href)
13
- header.each { |k, v| req[k] = v } if header
14
-
15
- res =
16
- Net::HTTP.start(href.hostname, href.port) do |http|
17
- http.request(req)
18
- end
19
-
20
- if res.is_a?(Net::HTTPSuccess)
21
- local_dir = File.dirname(local_path)
22
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
23
- content = res.body
24
- content = to_utf8(content) if convert_to_utf8
25
- File.write(local_path, content)
26
- puts 'succeed'
27
- return true
28
- else
29
- puts res
30
- end
31
- rescue StandardError => e
32
- puts e.backtrace
33
- puts e
34
- false
35
- end
36
- false
37
- end
38
-
39
- def direct_http_post(href, local_path, params,
40
- header: nil, convert_to_utf8: false)
41
- href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
42
-
43
- begin
44
- req = Net::HTTP::Post.new(href)
45
- req.set_form_data(params)
46
- header.each { |k, v| req[k] = v } if header
47
-
48
- res =
49
- Net::HTTP.start(href.hostname, href.port) do |http|
50
- http.request(req)
51
- end
52
-
53
- if res.is_a?(Net::HTTPSuccess)
54
- local_dir = File.dirname(local_path)
55
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
56
- content = res.body
57
- content = to_utf8(content) if convert_to_utf8
58
- File.write(local_path, content)
59
- puts 'succeed'
60
- return true
61
- else
62
- puts res
63
- end
64
- rescue StandardError => e
65
- puts e
66
- false
67
- end
68
- false
69
- end
70
-
71
- def extract_href_last(origin_href)
72
- origin_href.split('/')[-1]
73
- end
74
-
75
- def string_to_uri(href)
76
- l = href
77
- l.sub!('http:///', 'http://')
78
- l = Addressable::URI.parse(l)
79
- l.normalize!
80
- end
81
-
82
- BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
83
- 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
84
- 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
85
- 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
86
- 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
87
-
88
- # 此函数有时此判断有误,使用to_utf8函数直接转换
89
- def smart_to_utf8(str)
90
- return str if str.encoding == Encoding::UTF_8
91
- to_utf8(str)
92
- end
93
-
94
- def to_utf8(str)
95
- # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
96
- str.force_encoding(Encoding::ASCII_8BIT)
97
- cd = CharDet.detect(str)
98
- if cd['confidence'] > 0.6
99
- puts cd['encoding']
100
- str.force_encoding(cd['encoding'])
101
- # 移除BOM头
102
- bom_header = BomHeaderMap[cd['encoding']]
103
- str.sub!(bom_header, '') if bom_header
104
- end
105
- str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
106
-
107
- str
108
- end
109
- end
110
- end
1
+ require 'rchardet'
2
+ require 'net/http'
3
+
4
+ module SpiderHelper
5
+ class << self
6
+ def direct_http_get(href, local_path, params: nil,
7
+ header: nil, convert_to_utf8: false)
8
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
9
+
10
+ begin
11
+ href.query = URI.encode_www_form(params) if params
12
+ req = Net::HTTP::Get.new(href)
13
+ header.each { |k, v| req[k] = v } if header
14
+
15
+ res =
16
+ Net::HTTP.start(href.hostname, href.port) do |http|
17
+ http.request(req)
18
+ end
19
+
20
+ if res.is_a?(Net::HTTPSuccess)
21
+ local_dir = File.dirname(local_path)
22
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
23
+ content = res.body
24
+ content = to_utf8(content) if convert_to_utf8
25
+ File.write(local_path, content)
26
+ puts 'succeed'
27
+ return true
28
+ else
29
+ puts res
30
+ end
31
+ rescue StandardError => e
32
+ puts e.backtrace
33
+ puts e
34
+ false
35
+ end
36
+ false
37
+ end
38
+
39
+ def direct_http_post(href, local_path, params,
40
+ header: nil, convert_to_utf8: false)
41
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
42
+
43
+ begin
44
+ req = Net::HTTP::Post.new(href)
45
+ req.set_form_data(params)
46
+ header.each { |k, v| req[k] = v } if header
47
+
48
+ res =
49
+ Net::HTTP.start(href.hostname, href.port) do |http|
50
+ http.request(req)
51
+ end
52
+
53
+ if res.is_a?(Net::HTTPSuccess)
54
+ local_dir = File.dirname(local_path)
55
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
56
+ content = res.body
57
+ content = to_utf8(content) if convert_to_utf8
58
+ File.write(local_path, content)
59
+ puts 'succeed'
60
+ return true
61
+ else
62
+ puts res
63
+ end
64
+ rescue StandardError => e
65
+ puts e
66
+ false
67
+ end
68
+ false
69
+ end
70
+
71
+ def extract_href_last(origin_href)
72
+ origin_href.split('/')[-1]
73
+ end
74
+
75
+ def string_to_uri(href)
76
+ l = href
77
+ l.sub!('http:///', 'http://')
78
+ l = Addressable::URI.parse(l)
79
+ l.normalize!
80
+ end
81
+
82
+ BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
83
+ 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
84
+ 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
85
+ 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
86
+ 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
87
+
88
+ # 此函数有时此判断有误,使用to_utf8函数直接转换
89
+ def smart_to_utf8(str)
90
+ return str if str.encoding == Encoding::UTF_8
91
+ to_utf8(str)
92
+ end
93
+
94
+ def to_utf8(str)
95
+ # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
96
+ str.force_encoding(Encoding::ASCII_8BIT)
97
+ cd = CharDet.detect(str)
98
+ if cd['confidence'] > 0.6
99
+ puts cd['encoding']
100
+ str.force_encoding(cd['encoding'])
101
+ # 移除BOM头
102
+ bom_header = BomHeaderMap[cd['encoding']]
103
+ str.sub!(bom_header, '') if bom_header
104
+ end
105
+ str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
106
+
107
+ str
108
+ end
109
+ end
110
+ end
@@ -1,31 +1,31 @@
1
-
2
- lib = File.expand_path('lib', __dir__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'list_spider/version'
5
-
6
- Gem::Specification.new do |spec|
7
- spec.name = 'list_spider'
8
- spec.version = ListSpider::VERSION
9
- spec.authors = ['Charles Zhang']
10
- spec.email = ['gis05zc@163.com']
11
-
12
- spec.summary = 'List Spider'
13
- spec.description = 'A url list spider based on em-http-request.'
14
- spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
- spec.license = 'MIT'
16
-
17
- spec.files =
18
- `git ls-files -z`.split("\x0").reject do |f|
19
- f.match(%r{^(test|spec|features)/})
20
- end
21
- spec.bindir = 'exe'
22
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
- spec.require_paths = ['lib']
24
-
25
- spec.add_development_dependency 'bundler', '~> 1.16'
26
- spec.add_development_dependency 'rake', '~> 10.0'
27
-
28
- spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
- spec.add_dependency 'nokogiri', '~> 1.10'
30
- spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
- end
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'list_spider/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'list_spider'
8
+ spec.version = ListSpider::VERSION
9
+ spec.authors = ['Charles Zhang']
10
+ spec.email = ['gis05zc@163.com']
11
+
12
+ spec.summary = 'List Spider'
13
+ spec.description = 'A url list spider based on em-http-request.'
14
+ spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files =
18
+ `git ls-files -z`.split("\x0").reject do |f|
19
+ f.match(%r{^(test|spec|features)/})
20
+ end
21
+ spec.bindir = 'exe'
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ['lib']
24
+
25
+ spec.add_development_dependency 'bundler', '~> 1.16'
26
+ spec.add_development_dependency 'rake', '>= 12.3.3'
27
+
28
+ spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
+ spec.add_dependency 'nokogiri', '>= 1.10.8'
30
+ spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
+ end
@@ -1,27 +1,27 @@
1
- require 'list_spider'
2
- # require File.expand_path('../lib/list_spider', __FILE__)
3
-
4
- DOWNLOAD_DIR = 'coolshell/'.freeze
5
-
6
- def parse_index_item(e)
7
- content = File.read(e.local_path)
8
- doc = Nokogiri::HTML(content)
9
- list_group = doc.css('h2.entry-title')
10
- link_list = list_group.css('a')
11
-
12
- link_list.each do |link|
13
- href = link['href']
14
- local_path = DOWNLOAD_DIR + link.content + '.html'
15
- ListSpider.add_task(TaskStruct.new(href, local_path))
16
- end
17
- end
18
-
19
- # get_one is a simple function for one taskstruct situation
20
- ListSpider.get_one(
21
- TaskStruct.new(
22
- 'https://coolshell.cn/',
23
- DOWNLOAD_DIR + 'index.html',
24
- parse_method: method(:parse_index_item)
25
- ),
26
- max: 60
27
- )
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ DOWNLOAD_DIR = 'coolshell/'.freeze
5
+
6
+ def parse_index_item(e)
7
+ content = File.read(e.local_path)
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ local_path = DOWNLOAD_DIR + link.content + '.html'
15
+ ListSpider.add_task(TaskStruct.new(href, local_path))
16
+ end
17
+ end
18
+
19
+ # get_one is a simple function for one taskstruct situation
20
+ ListSpider.get_one(
21
+ TaskStruct.new(
22
+ 'https://coolshell.cn/',
23
+ DOWNLOAD_DIR + 'index.html',
24
+ parse_method: method(:parse_index_item)
25
+ ),
26
+ max: 60
27
+ )
@@ -1,29 +1,29 @@
1
- require 'list_spider'
2
-
3
- DOWNLOAD_DIR = 'coolshell/'.freeze
4
-
5
- @next_list = []
6
-
7
- def parse_index_item(e)
8
- content = File.read(e.local_path)
9
- doc = Nokogiri::HTML(content)
10
- list_group = doc.css('h2.entry-title')
11
- link_list = list_group.css('a')
12
-
13
- link_list.each do |link|
14
- href = link['href']
15
- local_path = DOWNLOAD_DIR + link.content + '.html'
16
- # or you can save them to database for later use
17
- @next_list << TaskStruct.new(href, local_path)
18
- end
19
- end
20
-
21
- task_list = []
22
- task_list << TaskStruct.new(
23
- 'https://coolshell.cn/',
24
- DOWNLOAD_DIR + 'index.html',
25
- parse_method: method(:parse_index_item)
26
- )
27
-
28
- ListSpider.get_list(task_list)
29
- ListSpider.get_list(@next_list, max: 60)
1
+ require 'list_spider'
2
+
3
+ DOWNLOAD_DIR = 'coolshell/'.freeze
4
+
5
+ @next_list = []
6
+
7
+ def parse_index_item(e)
8
+ content = File.read(e.local_path)
9
+ doc = Nokogiri::HTML(content)
10
+ list_group = doc.css('h2.entry-title')
11
+ link_list = list_group.css('a')
12
+
13
+ link_list.each do |link|
14
+ href = link['href']
15
+ local_path = DOWNLOAD_DIR + link.content + '.html'
16
+ # or you can save them to database for later use
17
+ @next_list << TaskStruct.new(href, local_path)
18
+ end
19
+ end
20
+
21
+ task_list = []
22
+ task_list << TaskStruct.new(
23
+ 'https://coolshell.cn/',
24
+ DOWNLOAD_DIR + 'index.html',
25
+ parse_method: method(:parse_index_item)
26
+ )
27
+
28
+ ListSpider.get_list(task_list)
29
+ ListSpider.get_list(@next_list, max: 60)
@@ -0,0 +1,34 @@
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ def call_back(task_struct, http_req)
5
+ puts "succeed"
6
+ puts http_req.response_header.status
7
+ content = http_req.response
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ ListSpider.add_task(TaskStruct.new(href,
15
+ callback: method(:call_back),
16
+ errback: method(:err_back)))
17
+ end
18
+ end
19
+
20
+ def err_back(task_struct, http_req)
21
+ puts "failed"
22
+ puts http_req.response_header.status
23
+ end
24
+
25
+ ListSpider.save_file = false
26
+
27
+ # get_one is a simple function for one taskstruct situation
28
+ ListSpider.get_one(
29
+ TaskStruct.new(
30
+ 'https://coolshell.cn/',
31
+ callback: method(:call_back),
32
+ errback: method(:err_back)
33
+ )
34
+ )
metadata CHANGED
@@ -1,24 +1,24 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.0
4
+ version: 2.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-06 00:00:00.000000000 Z
11
+ date: 2021-01-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
+ prerelease: false
15
16
  requirement: !ruby/object:Gem::Requirement
16
17
  requirements:
17
18
  - - "~>"
18
19
  - !ruby/object:Gem::Version
19
20
  version: '1.16'
20
21
  type: :development
21
- prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
@@ -26,20 +26,21 @@ dependencies:
26
26
  version: '1.16'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
+ prerelease: false
29
30
  requirement: !ruby/object:Gem::Requirement
30
31
  requirements:
31
- - - "~>"
32
+ - - ">="
32
33
  - !ruby/object:Gem::Version
33
- version: '10.0'
34
+ version: 12.3.3
34
35
  type: :development
35
- prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: em-http-request
43
+ prerelease: false
43
44
  requirement: !ruby/object:Gem::Requirement
44
45
  requirements:
45
46
  - - "~>"
@@ -49,7 +50,6 @@ dependencies:
49
50
  - !ruby/object:Gem::Version
50
51
  version: 1.1.3
51
52
  type: :runtime
52
- prerelease: false
53
53
  version_requirements: !ruby/object:Gem::Requirement
54
54
  requirements:
55
55
  - - "~>"
@@ -60,20 +60,21 @@ dependencies:
60
60
  version: 1.1.3
61
61
  - !ruby/object:Gem::Dependency
62
62
  name: nokogiri
63
+ prerelease: false
63
64
  requirement: !ruby/object:Gem::Requirement
64
65
  requirements:
65
- - - "~>"
66
+ - - ">="
66
67
  - !ruby/object:Gem::Version
67
- version: '1.10'
68
+ version: 1.10.8
68
69
  type: :runtime
69
- prerelease: false
70
70
  version_requirements: !ruby/object:Gem::Requirement
71
71
  requirements:
72
- - - "~>"
72
+ - - ">="
73
73
  - !ruby/object:Gem::Version
74
- version: '1.10'
74
+ version: 1.10.8
75
75
  - !ruby/object:Gem::Dependency
76
76
  name: rchardet
77
+ prerelease: false
77
78
  requirement: !ruby/object:Gem::Requirement
78
79
  requirements:
79
80
  - - "~>"
@@ -83,7 +84,6 @@ dependencies:
83
84
  - !ruby/object:Gem::Version
84
85
  version: 1.6.1
85
86
  type: :runtime
86
- prerelease: false
87
87
  version_requirements: !ruby/object:Gem::Requirement
88
88
  requirements:
89
89
  - - "~>"
@@ -117,6 +117,7 @@ files:
117
117
  - list_spider.gemspec
118
118
  - spider_example.rb
119
119
  - spider_example_2.rb
120
+ - spider_not_save_file.rb
120
121
  homepage: https://github.com/chinazhangchao/list_spider
121
122
  licenses:
122
123
  - MIT
@@ -136,8 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
136
137
  - !ruby/object:Gem::Version
137
138
  version: '0'
138
139
  requirements: []
139
- rubyforge_project:
140
- rubygems_version: 2.7.6
140
+ rubygems_version: 3.0.3
141
141
  signing_key:
142
142
  specification_version: 4
143
143
  summary: List Spider