list_spider 2.3.0 → 2.4.0

Sign up to get free protection for your applications and access to all the features.
@@ -1,3 +1,3 @@
1
- module ListSpider
2
- VERSION = '2.3.0'.freeze
3
- end
1
+ module ListSpider
2
+ VERSION = '2.4.0'.freeze
3
+ end
data/lib/spider_helper.rb CHANGED
@@ -1,110 +1,110 @@
1
- require 'rchardet'
2
- require 'net/http'
3
-
4
- module SpiderHelper
5
- class << self
6
- def direct_http_get(href, local_path, params: nil,
7
- header: nil, convert_to_utf8: false)
8
- href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
9
-
10
- begin
11
- href.query = URI.encode_www_form(params) if params
12
- req = Net::HTTP::Get.new(href)
13
- header.each { |k, v| req[k] = v } if header
14
-
15
- res =
16
- Net::HTTP.start(href.hostname, href.port) do |http|
17
- http.request(req)
18
- end
19
-
20
- if res.is_a?(Net::HTTPSuccess)
21
- local_dir = File.dirname(local_path)
22
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
23
- content = res.body
24
- content = to_utf8(content) if convert_to_utf8
25
- File.write(local_path, content)
26
- puts 'succeed'
27
- return true
28
- else
29
- puts res
30
- end
31
- rescue StandardError => e
32
- puts e.backtrace
33
- puts e
34
- false
35
- end
36
- false
37
- end
38
-
39
- def direct_http_post(href, local_path, params,
40
- header: nil, convert_to_utf8: false)
41
- href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
42
-
43
- begin
44
- req = Net::HTTP::Post.new(href)
45
- req.set_form_data(params)
46
- header.each { |k, v| req[k] = v } if header
47
-
48
- res =
49
- Net::HTTP.start(href.hostname, href.port) do |http|
50
- http.request(req)
51
- end
52
-
53
- if res.is_a?(Net::HTTPSuccess)
54
- local_dir = File.dirname(local_path)
55
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
56
- content = res.body
57
- content = to_utf8(content) if convert_to_utf8
58
- File.write(local_path, content)
59
- puts 'succeed'
60
- return true
61
- else
62
- puts res
63
- end
64
- rescue StandardError => e
65
- puts e
66
- false
67
- end
68
- false
69
- end
70
-
71
- def extract_href_last(origin_href)
72
- origin_href.split('/')[-1]
73
- end
74
-
75
- def string_to_uri(href)
76
- l = href
77
- l.sub!('http:///', 'http://')
78
- l = Addressable::URI.parse(l)
79
- l.normalize!
80
- end
81
-
82
- BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
83
- 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
84
- 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
85
- 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
86
- 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
87
-
88
- # 此函数有时此判断有误,使用to_utf8函数直接转换
89
- def smart_to_utf8(str)
90
- return str if str.encoding == Encoding::UTF_8
91
- to_utf8(str)
92
- end
93
-
94
- def to_utf8(str)
95
- # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
96
- str.force_encoding(Encoding::ASCII_8BIT)
97
- cd = CharDet.detect(str)
98
- if cd['confidence'] > 0.6
99
- puts cd['encoding']
100
- str.force_encoding(cd['encoding'])
101
- # 移除BOM头
102
- bom_header = BomHeaderMap[cd['encoding']]
103
- str.sub!(bom_header, '') if bom_header
104
- end
105
- str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
106
-
107
- str
108
- end
109
- end
110
- end
1
+ require 'rchardet'
2
+ require 'net/http'
3
+
4
+ module SpiderHelper
5
+ class << self
6
+ def direct_http_get(href, local_path, params: nil,
7
+ header: nil, convert_to_utf8: false)
8
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
9
+
10
+ begin
11
+ href.query = URI.encode_www_form(params) if params
12
+ req = Net::HTTP::Get.new(href)
13
+ header.each { |k, v| req[k] = v } if header
14
+
15
+ res =
16
+ Net::HTTP.start(href.hostname, href.port) do |http|
17
+ http.request(req)
18
+ end
19
+
20
+ if res.is_a?(Net::HTTPSuccess)
21
+ local_dir = File.dirname(local_path)
22
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
23
+ content = res.body
24
+ content = to_utf8(content) if convert_to_utf8
25
+ File.write(local_path, content)
26
+ puts 'succeed'
27
+ return true
28
+ else
29
+ puts res
30
+ end
31
+ rescue StandardError => e
32
+ puts e.backtrace
33
+ puts e
34
+ false
35
+ end
36
+ false
37
+ end
38
+
39
+ def direct_http_post(href, local_path, params,
40
+ header: nil, convert_to_utf8: false)
41
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
42
+
43
+ begin
44
+ req = Net::HTTP::Post.new(href)
45
+ req.set_form_data(params)
46
+ header.each { |k, v| req[k] = v } if header
47
+
48
+ res =
49
+ Net::HTTP.start(href.hostname, href.port) do |http|
50
+ http.request(req)
51
+ end
52
+
53
+ if res.is_a?(Net::HTTPSuccess)
54
+ local_dir = File.dirname(local_path)
55
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
56
+ content = res.body
57
+ content = to_utf8(content) if convert_to_utf8
58
+ File.write(local_path, content)
59
+ puts 'succeed'
60
+ return true
61
+ else
62
+ puts res
63
+ end
64
+ rescue StandardError => e
65
+ puts e
66
+ false
67
+ end
68
+ false
69
+ end
70
+
71
+ def extract_href_last(origin_href)
72
+ origin_href.split('/')[-1]
73
+ end
74
+
75
+ def string_to_uri(href)
76
+ l = href
77
+ l.sub!('http:///', 'http://')
78
+ l = Addressable::URI.parse(l)
79
+ l.normalize!
80
+ end
81
+
82
+ BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
83
+ 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
84
+ 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
85
+ 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
86
+ 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
87
+
88
+ # 此函数有时此判断有误,使用to_utf8函数直接转换
89
+ def smart_to_utf8(str)
90
+ return str if str.encoding == Encoding::UTF_8
91
+ to_utf8(str)
92
+ end
93
+
94
+ def to_utf8(str)
95
+ # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
96
+ str.force_encoding(Encoding::ASCII_8BIT)
97
+ cd = CharDet.detect(str)
98
+ if cd['confidence'] > 0.6
99
+ puts cd['encoding']
100
+ str.force_encoding(cd['encoding'])
101
+ # 移除BOM头
102
+ bom_header = BomHeaderMap[cd['encoding']]
103
+ str.sub!(bom_header, '') if bom_header
104
+ end
105
+ str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
106
+
107
+ str
108
+ end
109
+ end
110
+ end
data/list_spider.gemspec CHANGED
@@ -1,31 +1,31 @@
1
-
2
- lib = File.expand_path('lib', __dir__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'list_spider/version'
5
-
6
- Gem::Specification.new do |spec|
7
- spec.name = 'list_spider'
8
- spec.version = ListSpider::VERSION
9
- spec.authors = ['Charles Zhang']
10
- spec.email = ['gis05zc@163.com']
11
-
12
- spec.summary = 'List Spider'
13
- spec.description = 'A url list spider based on em-http-request.'
14
- spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
- spec.license = 'MIT'
16
-
17
- spec.files =
18
- `git ls-files -z`.split("\x0").reject do |f|
19
- f.match(%r{^(test|spec|features)/})
20
- end
21
- spec.bindir = 'exe'
22
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
- spec.require_paths = ['lib']
24
-
25
- spec.add_development_dependency 'bundler', '~> 1.16'
26
- spec.add_development_dependency 'rake', '~> 10.0'
27
-
28
- spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
- spec.add_dependency 'nokogiri', '~> 1.10'
30
- spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
- end
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'list_spider/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'list_spider'
8
+ spec.version = ListSpider::VERSION
9
+ spec.authors = ['Charles Zhang']
10
+ spec.email = ['gis05zc@163.com']
11
+
12
+ spec.summary = 'List Spider'
13
+ spec.description = 'A url list spider based on em-http-request.'
14
+ spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files =
18
+ `git ls-files -z`.split("\x0").reject do |f|
19
+ f.match(%r{^(test|spec|features)/})
20
+ end
21
+ spec.bindir = 'exe'
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ['lib']
24
+
25
+ spec.add_development_dependency 'bundler', '~> 1.16'
26
+ spec.add_development_dependency 'rake', '~> 10.0'
27
+
28
+ spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
+ spec.add_dependency 'nokogiri', '~> 1.10'
30
+ spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
+ end
data/spider_example.rb CHANGED
@@ -1,27 +1,27 @@
1
- require 'list_spider'
2
- # require File.expand_path('../lib/list_spider', __FILE__)
3
-
4
- DOWNLOAD_DIR = 'coolshell/'.freeze
5
-
6
- def parse_index_item(e)
7
- content = File.read(e.local_path)
8
- doc = Nokogiri::HTML(content)
9
- list_group = doc.css('h2.entry-title')
10
- link_list = list_group.css('a')
11
-
12
- link_list.each do |link|
13
- href = link['href']
14
- local_path = DOWNLOAD_DIR + link.content + '.html'
15
- ListSpider.add_task(TaskStruct.new(href, local_path))
16
- end
17
- end
18
-
19
- # get_one is a simple function for one taskstruct situation
20
- ListSpider.get_one(
21
- TaskStruct.new(
22
- 'https://coolshell.cn/',
23
- DOWNLOAD_DIR + 'index.html',
24
- parse_method: method(:parse_index_item)
25
- ),
26
- max: 60
27
- )
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ DOWNLOAD_DIR = 'coolshell/'.freeze
5
+
6
+ def parse_index_item(e)
7
+ content = File.read(e.local_path)
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ local_path = DOWNLOAD_DIR + link.content + '.html'
15
+ ListSpider.add_task(TaskStruct.new(href, local_path))
16
+ end
17
+ end
18
+
19
+ # get_one is a simple function for one taskstruct situation
20
+ ListSpider.get_one(
21
+ TaskStruct.new(
22
+ 'https://coolshell.cn/',
23
+ DOWNLOAD_DIR + 'index.html',
24
+ parse_method: method(:parse_index_item)
25
+ ),
26
+ max: 60
27
+ )
data/spider_example_2.rb CHANGED
@@ -1,29 +1,29 @@
1
- require 'list_spider'
2
-
3
- DOWNLOAD_DIR = 'coolshell/'.freeze
4
-
5
- @next_list = []
6
-
7
- def parse_index_item(e)
8
- content = File.read(e.local_path)
9
- doc = Nokogiri::HTML(content)
10
- list_group = doc.css('h2.entry-title')
11
- link_list = list_group.css('a')
12
-
13
- link_list.each do |link|
14
- href = link['href']
15
- local_path = DOWNLOAD_DIR + link.content + '.html'
16
- # or you can save them to database for later use
17
- @next_list << TaskStruct.new(href, local_path)
18
- end
19
- end
20
-
21
- task_list = []
22
- task_list << TaskStruct.new(
23
- 'https://coolshell.cn/',
24
- DOWNLOAD_DIR + 'index.html',
25
- parse_method: method(:parse_index_item)
26
- )
27
-
28
- ListSpider.get_list(task_list)
29
- ListSpider.get_list(@next_list, max: 60)
1
+ require 'list_spider'
2
+
3
+ DOWNLOAD_DIR = 'coolshell/'.freeze
4
+
5
+ @next_list = []
6
+
7
+ def parse_index_item(e)
8
+ content = File.read(e.local_path)
9
+ doc = Nokogiri::HTML(content)
10
+ list_group = doc.css('h2.entry-title')
11
+ link_list = list_group.css('a')
12
+
13
+ link_list.each do |link|
14
+ href = link['href']
15
+ local_path = DOWNLOAD_DIR + link.content + '.html'
16
+ # or you can save them to database for later use
17
+ @next_list << TaskStruct.new(href, local_path)
18
+ end
19
+ end
20
+
21
+ task_list = []
22
+ task_list << TaskStruct.new(
23
+ 'https://coolshell.cn/',
24
+ DOWNLOAD_DIR + 'index.html',
25
+ parse_method: method(:parse_index_item)
26
+ )
27
+
28
+ ListSpider.get_list(task_list)
29
+ ListSpider.get_list(@next_list, max: 60)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.0
4
+ version: 2.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-06 00:00:00.000000000 Z
11
+ date: 2020-03-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
136
136
  - !ruby/object:Gem::Version
137
137
  version: '0'
138
138
  requirements: []
139
- rubyforge_project:
140
- rubygems_version: 2.7.6
139
+ rubygems_version: 3.0.3
141
140
  signing_key:
142
141
  specification_version: 4
143
142
  summary: List Spider