list_spider 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
- module ListSpider
2
- VERSION = '2.2.0'.freeze
3
- end
1
+ module ListSpider
2
+ VERSION = '2.3.0'.freeze
3
+ end
@@ -1,110 +1,110 @@
1
- require 'rchardet'
2
- require 'net/http'
3
-
4
- module SpiderHelper
5
- class << self
6
- def direct_http_get(href, local_path, params: nil,
7
- header: nil, convert_to_utf8: false)
8
- href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
9
-
10
- begin
11
- href.query = URI.encode_www_form(params) if params
12
- req = Net::HTTP::Get.new(href)
13
- header.each { |k, v| req[k] = v } if header
14
-
15
- res =
16
- Net::HTTP.start(href.hostname, href.port) do |http|
17
- http.request(req)
18
- end
19
-
20
- if res.is_a?(Net::HTTPSuccess)
21
- local_dir = File.dirname(local_path)
22
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
23
- content = res.body
24
- content = to_utf8(content) if convert_to_utf8
25
- File.write(local_path, content)
26
- puts 'succeed'
27
- return true
28
- else
29
- puts res
30
- end
31
- rescue StandardError => e
32
- puts e.backtrace
33
- puts e
34
- false
35
- end
36
- false
37
- end
38
-
39
- def direct_http_post(href, local_path, params,
40
- header: nil, convert_to_utf8: false)
41
- href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
42
-
43
- begin
44
- req = Net::HTTP::Post.new(href)
45
- req.set_form_data(params)
46
- header.each { |k, v| req[k] = v } if header
47
-
48
- res =
49
- Net::HTTP.start(href.hostname, href.port) do |http|
50
- http.request(req)
51
- end
52
-
53
- if res.is_a?(Net::HTTPSuccess)
54
- local_dir = File.dirname(local_path)
55
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
56
- content = res.body
57
- content = to_utf8(content) if convert_to_utf8
58
- File.write(local_path, content)
59
- puts 'succeed'
60
- return true
61
- else
62
- puts res
63
- end
64
- rescue StandardError => e
65
- puts e
66
- false
67
- end
68
- false
69
- end
70
-
71
- def extract_href_last(origin_href)
72
- origin_href.split('/')[-1]
73
- end
74
-
75
- def string_to_uri(href)
76
- l = href
77
- l.sub!('http:///', 'http://')
78
- l = Addressable::URI.parse(l)
79
- l.normalize!
80
- end
81
-
82
- BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
83
- 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
84
- 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
85
- 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
86
- 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
87
-
88
- # 此函数有时此判断有误,使用to_utf8函数直接转换
89
- def smart_to_utf8(str)
90
- return str if str.encoding == Encoding::UTF_8
91
- to_utf8(str)
92
- end
93
-
94
- def to_utf8(str)
95
- # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
96
- str.force_encoding(Encoding::ASCII_8BIT)
97
- cd = CharDet.detect(str)
98
- if cd['confidence'] > 0.6
99
- puts cd['encoding']
100
- str.force_encoding(cd['encoding'])
101
- # 移除BOM头
102
- bom_header = BomHeaderMap[cd['encoding']]
103
- str.sub!(bom_header, '') if bom_header
104
- end
105
- str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
106
-
107
- str
108
- end
109
- end
110
- end
1
+ require 'rchardet'
2
+ require 'net/http'
3
+
4
+ module SpiderHelper
5
+ class << self
6
+ def direct_http_get(href, local_path, params: nil,
7
+ header: nil, convert_to_utf8: false)
8
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
9
+
10
+ begin
11
+ href.query = URI.encode_www_form(params) if params
12
+ req = Net::HTTP::Get.new(href)
13
+ header.each { |k, v| req[k] = v } if header
14
+
15
+ res =
16
+ Net::HTTP.start(href.hostname, href.port) do |http|
17
+ http.request(req)
18
+ end
19
+
20
+ if res.is_a?(Net::HTTPSuccess)
21
+ local_dir = File.dirname(local_path)
22
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
23
+ content = res.body
24
+ content = to_utf8(content) if convert_to_utf8
25
+ File.write(local_path, content)
26
+ puts 'succeed'
27
+ return true
28
+ else
29
+ puts res
30
+ end
31
+ rescue StandardError => e
32
+ puts e.backtrace
33
+ puts e
34
+ false
35
+ end
36
+ false
37
+ end
38
+
39
+ def direct_http_post(href, local_path, params,
40
+ header: nil, convert_to_utf8: false)
41
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
42
+
43
+ begin
44
+ req = Net::HTTP::Post.new(href)
45
+ req.set_form_data(params)
46
+ header.each { |k, v| req[k] = v } if header
47
+
48
+ res =
49
+ Net::HTTP.start(href.hostname, href.port) do |http|
50
+ http.request(req)
51
+ end
52
+
53
+ if res.is_a?(Net::HTTPSuccess)
54
+ local_dir = File.dirname(local_path)
55
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
56
+ content = res.body
57
+ content = to_utf8(content) if convert_to_utf8
58
+ File.write(local_path, content)
59
+ puts 'succeed'
60
+ return true
61
+ else
62
+ puts res
63
+ end
64
+ rescue StandardError => e
65
+ puts e
66
+ false
67
+ end
68
+ false
69
+ end
70
+
71
+ def extract_href_last(origin_href)
72
+ origin_href.split('/')[-1]
73
+ end
74
+
75
+ def string_to_uri(href)
76
+ l = href
77
+ l.sub!('http:///', 'http://')
78
+ l = Addressable::URI.parse(l)
79
+ l.normalize!
80
+ end
81
+
82
+ BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
83
+ 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
84
+ 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
85
+ 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
86
+ 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
87
+
88
+ # 此函数有时此判断有误,使用to_utf8函数直接转换
89
+ def smart_to_utf8(str)
90
+ return str if str.encoding == Encoding::UTF_8
91
+ to_utf8(str)
92
+ end
93
+
94
+ def to_utf8(str)
95
+ # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
96
+ str.force_encoding(Encoding::ASCII_8BIT)
97
+ cd = CharDet.detect(str)
98
+ if cd['confidence'] > 0.6
99
+ puts cd['encoding']
100
+ str.force_encoding(cd['encoding'])
101
+ # 移除BOM头
102
+ bom_header = BomHeaderMap[cd['encoding']]
103
+ str.sub!(bom_header, '') if bom_header
104
+ end
105
+ str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
106
+
107
+ str
108
+ end
109
+ end
110
+ end
@@ -1,31 +1,31 @@
1
-
2
- lib = File.expand_path('lib', __dir__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'list_spider/version'
5
-
6
- Gem::Specification.new do |spec|
7
- spec.name = 'list_spider'
8
- spec.version = ListSpider::VERSION
9
- spec.authors = ['Charles Zhang']
10
- spec.email = ['gis05zc@163.com']
11
-
12
- spec.summary = 'List Spider'
13
- spec.description = 'A url list spider based on em-http-request.'
14
- spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
- spec.license = 'MIT'
16
-
17
- spec.files =
18
- `git ls-files -z`.split("\x0").reject do |f|
19
- f.match(%r{^(test|spec|features)/})
20
- end
21
- spec.bindir = 'exe'
22
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
- spec.require_paths = ['lib']
24
-
25
- spec.add_development_dependency 'bundler', '~> 1.16'
26
- spec.add_development_dependency 'rake', '~> 10.0'
27
-
28
- spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
- spec.add_dependency 'nokogiri', '~> 1.11'
30
- spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
- end
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'list_spider/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'list_spider'
8
+ spec.version = ListSpider::VERSION
9
+ spec.authors = ['Charles Zhang']
10
+ spec.email = ['gis05zc@163.com']
11
+
12
+ spec.summary = 'List Spider'
13
+ spec.description = 'A url list spider based on em-http-request.'
14
+ spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files =
18
+ `git ls-files -z`.split("\x0").reject do |f|
19
+ f.match(%r{^(test|spec|features)/})
20
+ end
21
+ spec.bindir = 'exe'
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ['lib']
24
+
25
+ spec.add_development_dependency 'bundler', '~> 1.16'
26
+ spec.add_development_dependency 'rake', '~> 10.0'
27
+
28
+ spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
+ spec.add_dependency 'nokogiri', '~> 1.10'
30
+ spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
+ end
@@ -1,27 +1,27 @@
1
- require 'list_spider'
2
- # require File.expand_path('../lib/list_spider', __FILE__)
3
-
4
- DOWNLOAD_DIR = 'coolshell/'.freeze
5
-
6
- def parse_index_item(e)
7
- content = File.read(e.local_path)
8
- doc = Nokogiri::HTML(content)
9
- list_group = doc.css('h2.entry-title')
10
- link_list = list_group.css('a')
11
-
12
- link_list.each do |link|
13
- href = link['href']
14
- local_path = DOWNLOAD_DIR + link.content + '.html'
15
- ListSpider.add_task(TaskStruct.new(href, local_path))
16
- end
17
- end
18
-
19
- # get_one is a simple function for one taskstruct situation
20
- ListSpider.get_one(
21
- TaskStruct.new(
22
- 'https://coolshell.cn/',
23
- DOWNLOAD_DIR + 'index.html',
24
- parse_method: method(:parse_index_item)
25
- ),
26
- max: 60
27
- )
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ DOWNLOAD_DIR = 'coolshell/'.freeze
5
+
6
+ def parse_index_item(e)
7
+ content = File.read(e.local_path)
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ local_path = DOWNLOAD_DIR + link.content + '.html'
15
+ ListSpider.add_task(TaskStruct.new(href, local_path))
16
+ end
17
+ end
18
+
19
+ # get_one is a simple function for one taskstruct situation
20
+ ListSpider.get_one(
21
+ TaskStruct.new(
22
+ 'https://coolshell.cn/',
23
+ DOWNLOAD_DIR + 'index.html',
24
+ parse_method: method(:parse_index_item)
25
+ ),
26
+ max: 60
27
+ )
@@ -1,29 +1,29 @@
1
- require 'list_spider'
2
-
3
- DOWNLOAD_DIR = 'coolshell/'.freeze
4
-
5
- @next_list = []
6
-
7
- def parse_index_item(e)
8
- content = File.read(e.local_path)
9
- doc = Nokogiri::HTML(content)
10
- list_group = doc.css('h2.entry-title')
11
- link_list = list_group.css('a')
12
-
13
- link_list.each do |link|
14
- href = link['href']
15
- local_path = DOWNLOAD_DIR + link.content + '.html'
16
- # or you can save them to database for later use
17
- @next_list << TaskStruct.new(href, local_path)
18
- end
19
- end
20
-
21
- task_list = []
22
- task_list << TaskStruct.new(
23
- 'https://coolshell.cn/',
24
- DOWNLOAD_DIR + 'index.html',
25
- parse_method: method(:parse_index_item)
26
- )
27
-
28
- ListSpider.get_list(task_list)
29
- ListSpider.get_list(@next_list, max: 60)
1
+ require 'list_spider'
2
+
3
+ DOWNLOAD_DIR = 'coolshell/'.freeze
4
+
5
+ @next_list = []
6
+
7
+ def parse_index_item(e)
8
+ content = File.read(e.local_path)
9
+ doc = Nokogiri::HTML(content)
10
+ list_group = doc.css('h2.entry-title')
11
+ link_list = list_group.css('a')
12
+
13
+ link_list.each do |link|
14
+ href = link['href']
15
+ local_path = DOWNLOAD_DIR + link.content + '.html'
16
+ # or you can save them to database for later use
17
+ @next_list << TaskStruct.new(href, local_path)
18
+ end
19
+ end
20
+
21
+ task_list = []
22
+ task_list << TaskStruct.new(
23
+ 'https://coolshell.cn/',
24
+ DOWNLOAD_DIR + 'index.html',
25
+ parse_method: method(:parse_index_item)
26
+ )
27
+
28
+ ListSpider.get_list(task_list)
29
+ ListSpider.get_list(@next_list, max: 60)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-09 00:00:00.000000000 Z
11
+ date: 2020-01-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -64,14 +64,14 @@ dependencies:
64
64
  requirements:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
- version: '1.11'
67
+ version: '1.10'
68
68
  type: :runtime
69
69
  prerelease: false
70
70
  version_requirements: !ruby/object:Gem::Requirement
71
71
  requirements:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
- version: '1.11'
74
+ version: '1.10'
75
75
  - !ruby/object:Gem::Dependency
76
76
  name: rchardet
77
77
  requirement: !ruby/object:Gem::Requirement
@@ -136,7 +136,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
136
136
  - !ruby/object:Gem::Version
137
137
  version: '0'
138
138
  requirements: []
139
- rubygems_version: 3.0.3
139
+ rubyforge_project:
140
+ rubygems_version: 2.7.6
140
141
  signing_key:
141
142
  specification_version: 4
142
143
  summary: List Spider