list_spider 2.3.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +298 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +3 -4
data/lib/list_spider/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
module ListSpider
|
2
|
-
VERSION = '2.3.0'.freeze
|
3
|
-
end
|
1
|
+
module ListSpider
|
2
|
+
VERSION = '2.4.0'.freeze
|
3
|
+
end
|
data/lib/spider_helper.rb
CHANGED
@@ -1,110 +1,110 @@
|
|
1
|
-
require 'rchardet'
|
2
|
-
require 'net/http'
|
3
|
-
|
4
|
-
module SpiderHelper
|
5
|
-
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil,
|
7
|
-
header: nil, convert_to_utf8: false)
|
8
|
-
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
9
|
-
|
10
|
-
begin
|
11
|
-
href.query = URI.encode_www_form(params) if params
|
12
|
-
req = Net::HTTP::Get.new(href)
|
13
|
-
header.each { |k, v| req[k] = v } if header
|
14
|
-
|
15
|
-
res =
|
16
|
-
Net::HTTP.start(href.hostname, href.port) do |http|
|
17
|
-
http.request(req)
|
18
|
-
end
|
19
|
-
|
20
|
-
if res.is_a?(Net::HTTPSuccess)
|
21
|
-
local_dir = File.dirname(local_path)
|
22
|
-
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
23
|
-
content = res.body
|
24
|
-
content = to_utf8(content) if convert_to_utf8
|
25
|
-
File.write(local_path, content)
|
26
|
-
puts 'succeed'
|
27
|
-
return true
|
28
|
-
else
|
29
|
-
puts res
|
30
|
-
end
|
31
|
-
rescue StandardError => e
|
32
|
-
puts e.backtrace
|
33
|
-
puts e
|
34
|
-
false
|
35
|
-
end
|
36
|
-
false
|
37
|
-
end
|
38
|
-
|
39
|
-
def direct_http_post(href, local_path, params,
|
40
|
-
header: nil, convert_to_utf8: false)
|
41
|
-
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
42
|
-
|
43
|
-
begin
|
44
|
-
req = Net::HTTP::Post.new(href)
|
45
|
-
req.set_form_data(params)
|
46
|
-
header.each { |k, v| req[k] = v } if header
|
47
|
-
|
48
|
-
res =
|
49
|
-
Net::HTTP.start(href.hostname, href.port) do |http|
|
50
|
-
http.request(req)
|
51
|
-
end
|
52
|
-
|
53
|
-
if res.is_a?(Net::HTTPSuccess)
|
54
|
-
local_dir = File.dirname(local_path)
|
55
|
-
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
56
|
-
content = res.body
|
57
|
-
content = to_utf8(content) if convert_to_utf8
|
58
|
-
File.write(local_path, content)
|
59
|
-
puts 'succeed'
|
60
|
-
return true
|
61
|
-
else
|
62
|
-
puts res
|
63
|
-
end
|
64
|
-
rescue StandardError => e
|
65
|
-
puts e
|
66
|
-
false
|
67
|
-
end
|
68
|
-
false
|
69
|
-
end
|
70
|
-
|
71
|
-
def extract_href_last(origin_href)
|
72
|
-
origin_href.split('/')[-1]
|
73
|
-
end
|
74
|
-
|
75
|
-
def string_to_uri(href)
|
76
|
-
l = href
|
77
|
-
l.sub!('http:///', 'http://')
|
78
|
-
l = Addressable::URI.parse(l)
|
79
|
-
l.normalize!
|
80
|
-
end
|
81
|
-
|
82
|
-
BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
|
83
|
-
'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
|
84
|
-
'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
|
85
|
-
'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
|
86
|
-
'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
|
87
|
-
|
88
|
-
# 此函数有时此判断有误,使用to_utf8函数直接转换
|
89
|
-
def smart_to_utf8(str)
|
90
|
-
return str if str.encoding == Encoding::UTF_8
|
91
|
-
to_utf8(str)
|
92
|
-
end
|
93
|
-
|
94
|
-
def to_utf8(str)
|
95
|
-
# 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
|
96
|
-
str.force_encoding(Encoding::ASCII_8BIT)
|
97
|
-
cd = CharDet.detect(str)
|
98
|
-
if cd['confidence'] > 0.6
|
99
|
-
puts cd['encoding']
|
100
|
-
str.force_encoding(cd['encoding'])
|
101
|
-
# 移除BOM头
|
102
|
-
bom_header = BomHeaderMap[cd['encoding']]
|
103
|
-
str.sub!(bom_header, '') if bom_header
|
104
|
-
end
|
105
|
-
str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
|
106
|
-
|
107
|
-
str
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
1
|
+
require 'rchardet'
|
2
|
+
require 'net/http'
|
3
|
+
|
4
|
+
module SpiderHelper
|
5
|
+
class << self
|
6
|
+
def direct_http_get(href, local_path, params: nil,
|
7
|
+
header: nil, convert_to_utf8: false)
|
8
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
9
|
+
|
10
|
+
begin
|
11
|
+
href.query = URI.encode_www_form(params) if params
|
12
|
+
req = Net::HTTP::Get.new(href)
|
13
|
+
header.each { |k, v| req[k] = v } if header
|
14
|
+
|
15
|
+
res =
|
16
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
17
|
+
http.request(req)
|
18
|
+
end
|
19
|
+
|
20
|
+
if res.is_a?(Net::HTTPSuccess)
|
21
|
+
local_dir = File.dirname(local_path)
|
22
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
23
|
+
content = res.body
|
24
|
+
content = to_utf8(content) if convert_to_utf8
|
25
|
+
File.write(local_path, content)
|
26
|
+
puts 'succeed'
|
27
|
+
return true
|
28
|
+
else
|
29
|
+
puts res
|
30
|
+
end
|
31
|
+
rescue StandardError => e
|
32
|
+
puts e.backtrace
|
33
|
+
puts e
|
34
|
+
false
|
35
|
+
end
|
36
|
+
false
|
37
|
+
end
|
38
|
+
|
39
|
+
def direct_http_post(href, local_path, params,
|
40
|
+
header: nil, convert_to_utf8: false)
|
41
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
42
|
+
|
43
|
+
begin
|
44
|
+
req = Net::HTTP::Post.new(href)
|
45
|
+
req.set_form_data(params)
|
46
|
+
header.each { |k, v| req[k] = v } if header
|
47
|
+
|
48
|
+
res =
|
49
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
50
|
+
http.request(req)
|
51
|
+
end
|
52
|
+
|
53
|
+
if res.is_a?(Net::HTTPSuccess)
|
54
|
+
local_dir = File.dirname(local_path)
|
55
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
56
|
+
content = res.body
|
57
|
+
content = to_utf8(content) if convert_to_utf8
|
58
|
+
File.write(local_path, content)
|
59
|
+
puts 'succeed'
|
60
|
+
return true
|
61
|
+
else
|
62
|
+
puts res
|
63
|
+
end
|
64
|
+
rescue StandardError => e
|
65
|
+
puts e
|
66
|
+
false
|
67
|
+
end
|
68
|
+
false
|
69
|
+
end
|
70
|
+
|
71
|
+
def extract_href_last(origin_href)
|
72
|
+
origin_href.split('/')[-1]
|
73
|
+
end
|
74
|
+
|
75
|
+
def string_to_uri(href)
|
76
|
+
l = href
|
77
|
+
l.sub!('http:///', 'http://')
|
78
|
+
l = Addressable::URI.parse(l)
|
79
|
+
l.normalize!
|
80
|
+
end
|
81
|
+
|
82
|
+
BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
|
83
|
+
'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
|
84
|
+
'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
|
85
|
+
'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
|
86
|
+
'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
|
87
|
+
|
88
|
+
# 此函数有时此判断有误,使用to_utf8函数直接转换
|
89
|
+
def smart_to_utf8(str)
|
90
|
+
return str if str.encoding == Encoding::UTF_8
|
91
|
+
to_utf8(str)
|
92
|
+
end
|
93
|
+
|
94
|
+
def to_utf8(str)
|
95
|
+
# 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
|
96
|
+
str.force_encoding(Encoding::ASCII_8BIT)
|
97
|
+
cd = CharDet.detect(str)
|
98
|
+
if cd['confidence'] > 0.6
|
99
|
+
puts cd['encoding']
|
100
|
+
str.force_encoding(cd['encoding'])
|
101
|
+
# 移除BOM头
|
102
|
+
bom_header = BomHeaderMap[cd['encoding']]
|
103
|
+
str.sub!(bom_header, '') if bom_header
|
104
|
+
end
|
105
|
+
str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
|
106
|
+
|
107
|
+
str
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
data/list_spider.gemspec
CHANGED
@@ -1,31 +1,31 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('lib', __dir__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'list_spider/version'
|
5
|
-
|
6
|
-
Gem::Specification.new do |spec|
|
7
|
-
spec.name = 'list_spider'
|
8
|
-
spec.version = ListSpider::VERSION
|
9
|
-
spec.authors = ['Charles Zhang']
|
10
|
-
spec.email = ['gis05zc@163.com']
|
11
|
-
|
12
|
-
spec.summary = 'List Spider'
|
13
|
-
spec.description = 'A url list spider based on em-http-request.'
|
14
|
-
spec.homepage = 'https://github.com/chinazhangchao/list_spider'
|
15
|
-
spec.license = 'MIT'
|
16
|
-
|
17
|
-
spec.files =
|
18
|
-
`git ls-files -z`.split("\x0").reject do |f|
|
19
|
-
f.match(%r{^(test|spec|features)/})
|
20
|
-
end
|
21
|
-
spec.bindir = 'exe'
|
22
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
-
spec.require_paths = ['lib']
|
24
|
-
|
25
|
-
spec.add_development_dependency 'bundler', '~> 1.16'
|
26
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
27
|
-
|
28
|
-
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
-
spec.add_dependency 'nokogiri', '~> 1.10'
|
30
|
-
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
|
-
end
|
1
|
+
|
2
|
+
lib = File.expand_path('lib', __dir__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'list_spider/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'list_spider'
|
8
|
+
spec.version = ListSpider::VERSION
|
9
|
+
spec.authors = ['Charles Zhang']
|
10
|
+
spec.email = ['gis05zc@163.com']
|
11
|
+
|
12
|
+
spec.summary = 'List Spider'
|
13
|
+
spec.description = 'A url list spider based on em-http-request.'
|
14
|
+
spec.homepage = 'https://github.com/chinazhangchao/list_spider'
|
15
|
+
spec.license = 'MIT'
|
16
|
+
|
17
|
+
spec.files =
|
18
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
19
|
+
f.match(%r{^(test|spec|features)/})
|
20
|
+
end
|
21
|
+
spec.bindir = 'exe'
|
22
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
+
spec.require_paths = ['lib']
|
24
|
+
|
25
|
+
spec.add_development_dependency 'bundler', '~> 1.16'
|
26
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
27
|
+
|
28
|
+
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
+
spec.add_dependency 'nokogiri', '~> 1.10'
|
30
|
+
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
|
+
end
|
data/spider_example.rb
CHANGED
@@ -1,27 +1,27 @@
|
|
1
|
-
require 'list_spider'
|
2
|
-
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
|
-
|
4
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
|
-
|
6
|
-
def parse_index_item(e)
|
7
|
-
content = File.read(e.local_path)
|
8
|
-
doc = Nokogiri::HTML(content)
|
9
|
-
list_group = doc.css('h2.entry-title')
|
10
|
-
link_list = list_group.css('a')
|
11
|
-
|
12
|
-
link_list.each do |link|
|
13
|
-
href = link['href']
|
14
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
15
|
-
ListSpider.add_task(TaskStruct.new(href, local_path))
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
# get_one is a simple function for one taskstruct situation
|
20
|
-
ListSpider.get_one(
|
21
|
-
TaskStruct.new(
|
22
|
-
'https://coolshell.cn/',
|
23
|
-
DOWNLOAD_DIR + 'index.html',
|
24
|
-
parse_method: method(:parse_index_item)
|
25
|
-
),
|
26
|
-
max: 60
|
27
|
-
)
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
|
+
|
4
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
|
+
|
6
|
+
def parse_index_item(e)
|
7
|
+
content = File.read(e.local_path)
|
8
|
+
doc = Nokogiri::HTML(content)
|
9
|
+
list_group = doc.css('h2.entry-title')
|
10
|
+
link_list = list_group.css('a')
|
11
|
+
|
12
|
+
link_list.each do |link|
|
13
|
+
href = link['href']
|
14
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
15
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
# get_one is a simple function for one taskstruct situation
|
20
|
+
ListSpider.get_one(
|
21
|
+
TaskStruct.new(
|
22
|
+
'https://coolshell.cn/',
|
23
|
+
DOWNLOAD_DIR + 'index.html',
|
24
|
+
parse_method: method(:parse_index_item)
|
25
|
+
),
|
26
|
+
max: 60
|
27
|
+
)
|
data/spider_example_2.rb
CHANGED
@@ -1,29 +1,29 @@
|
|
1
|
-
require 'list_spider'
|
2
|
-
|
3
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
4
|
-
|
5
|
-
@next_list = []
|
6
|
-
|
7
|
-
def parse_index_item(e)
|
8
|
-
content = File.read(e.local_path)
|
9
|
-
doc = Nokogiri::HTML(content)
|
10
|
-
list_group = doc.css('h2.entry-title')
|
11
|
-
link_list = list_group.css('a')
|
12
|
-
|
13
|
-
link_list.each do |link|
|
14
|
-
href = link['href']
|
15
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
16
|
-
# or you can save them to database for later use
|
17
|
-
@next_list << TaskStruct.new(href, local_path)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
task_list = []
|
22
|
-
task_list << TaskStruct.new(
|
23
|
-
'https://coolshell.cn/',
|
24
|
-
DOWNLOAD_DIR + 'index.html',
|
25
|
-
parse_method: method(:parse_index_item)
|
26
|
-
)
|
27
|
-
|
28
|
-
ListSpider.get_list(task_list)
|
29
|
-
ListSpider.get_list(@next_list, max: 60)
|
1
|
+
require 'list_spider'
|
2
|
+
|
3
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
4
|
+
|
5
|
+
@next_list = []
|
6
|
+
|
7
|
+
def parse_index_item(e)
|
8
|
+
content = File.read(e.local_path)
|
9
|
+
doc = Nokogiri::HTML(content)
|
10
|
+
list_group = doc.css('h2.entry-title')
|
11
|
+
link_list = list_group.css('a')
|
12
|
+
|
13
|
+
link_list.each do |link|
|
14
|
+
href = link['href']
|
15
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
16
|
+
# or you can save them to database for later use
|
17
|
+
@next_list << TaskStruct.new(href, local_path)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
task_list = []
|
22
|
+
task_list << TaskStruct.new(
|
23
|
+
'https://coolshell.cn/',
|
24
|
+
DOWNLOAD_DIR + 'index.html',
|
25
|
+
parse_method: method(:parse_index_item)
|
26
|
+
)
|
27
|
+
|
28
|
+
ListSpider.get_list(task_list)
|
29
|
+
ListSpider.get_list(@next_list, max: 60)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.0
|
4
|
+
version: 2.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-03-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -136,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
136
136
|
- !ruby/object:Gem::Version
|
137
137
|
version: '0'
|
138
138
|
requirements: []
|
139
|
-
|
140
|
-
rubygems_version: 2.7.6
|
139
|
+
rubygems_version: 3.0.3
|
141
140
|
signing_key:
|
142
141
|
specification_version: 4
|
143
142
|
summary: List Spider
|