list_spider 2.2.0 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/Gemfile.lock +12 -11
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +297 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +6 -5
data/lib/list_spider/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
module ListSpider
|
2
|
-
VERSION = '2.2.0'.freeze
|
3
|
-
end
|
1
|
+
module ListSpider
|
2
|
+
VERSION = '2.3.0'.freeze
|
3
|
+
end
|
data/lib/spider_helper.rb
CHANGED
@@ -1,110 +1,110 @@
|
|
1
|
-
require 'rchardet'
|
2
|
-
require 'net/http'
|
3
|
-
|
4
|
-
module SpiderHelper
|
5
|
-
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil,
|
7
|
-
header: nil, convert_to_utf8: false)
|
8
|
-
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
9
|
-
|
10
|
-
begin
|
11
|
-
href.query = URI.encode_www_form(params) if params
|
12
|
-
req = Net::HTTP::Get.new(href)
|
13
|
-
header.each { |k, v| req[k] = v } if header
|
14
|
-
|
15
|
-
res =
|
16
|
-
Net::HTTP.start(href.hostname, href.port) do |http|
|
17
|
-
http.request(req)
|
18
|
-
end
|
19
|
-
|
20
|
-
if res.is_a?(Net::HTTPSuccess)
|
21
|
-
local_dir = File.dirname(local_path)
|
22
|
-
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
23
|
-
content = res.body
|
24
|
-
content = to_utf8(content) if convert_to_utf8
|
25
|
-
File.write(local_path, content)
|
26
|
-
puts 'succeed'
|
27
|
-
return true
|
28
|
-
else
|
29
|
-
puts res
|
30
|
-
end
|
31
|
-
rescue StandardError => e
|
32
|
-
puts e.backtrace
|
33
|
-
puts e
|
34
|
-
false
|
35
|
-
end
|
36
|
-
false
|
37
|
-
end
|
38
|
-
|
39
|
-
def direct_http_post(href, local_path, params,
|
40
|
-
header: nil, convert_to_utf8: false)
|
41
|
-
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
42
|
-
|
43
|
-
begin
|
44
|
-
req = Net::HTTP::Post.new(href)
|
45
|
-
req.set_form_data(params)
|
46
|
-
header.each { |k, v| req[k] = v } if header
|
47
|
-
|
48
|
-
res =
|
49
|
-
Net::HTTP.start(href.hostname, href.port) do |http|
|
50
|
-
http.request(req)
|
51
|
-
end
|
52
|
-
|
53
|
-
if res.is_a?(Net::HTTPSuccess)
|
54
|
-
local_dir = File.dirname(local_path)
|
55
|
-
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
56
|
-
content = res.body
|
57
|
-
content = to_utf8(content) if convert_to_utf8
|
58
|
-
File.write(local_path, content)
|
59
|
-
puts 'succeed'
|
60
|
-
return true
|
61
|
-
else
|
62
|
-
puts res
|
63
|
-
end
|
64
|
-
rescue StandardError => e
|
65
|
-
puts e
|
66
|
-
false
|
67
|
-
end
|
68
|
-
false
|
69
|
-
end
|
70
|
-
|
71
|
-
def extract_href_last(origin_href)
|
72
|
-
origin_href.split('/')[-1]
|
73
|
-
end
|
74
|
-
|
75
|
-
def string_to_uri(href)
|
76
|
-
l = href
|
77
|
-
l.sub!('http:///', 'http://')
|
78
|
-
l = Addressable::URI.parse(l)
|
79
|
-
l.normalize!
|
80
|
-
end
|
81
|
-
|
82
|
-
BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
|
83
|
-
'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
|
84
|
-
'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
|
85
|
-
'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
|
86
|
-
'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
|
87
|
-
|
88
|
-
# 此函数有时此判断有误,使用to_utf8函数直接转换
|
89
|
-
def smart_to_utf8(str)
|
90
|
-
return str if str.encoding == Encoding::UTF_8
|
91
|
-
to_utf8(str)
|
92
|
-
end
|
93
|
-
|
94
|
-
def to_utf8(str)
|
95
|
-
# 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
|
96
|
-
str.force_encoding(Encoding::ASCII_8BIT)
|
97
|
-
cd = CharDet.detect(str)
|
98
|
-
if cd['confidence'] > 0.6
|
99
|
-
puts cd['encoding']
|
100
|
-
str.force_encoding(cd['encoding'])
|
101
|
-
# 移除BOM头
|
102
|
-
bom_header = BomHeaderMap[cd['encoding']]
|
103
|
-
str.sub!(bom_header, '') if bom_header
|
104
|
-
end
|
105
|
-
str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
|
106
|
-
|
107
|
-
str
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
1
|
+
require 'rchardet'
|
2
|
+
require 'net/http'
|
3
|
+
|
4
|
+
module SpiderHelper
|
5
|
+
class << self
|
6
|
+
def direct_http_get(href, local_path, params: nil,
|
7
|
+
header: nil, convert_to_utf8: false)
|
8
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
9
|
+
|
10
|
+
begin
|
11
|
+
href.query = URI.encode_www_form(params) if params
|
12
|
+
req = Net::HTTP::Get.new(href)
|
13
|
+
header.each { |k, v| req[k] = v } if header
|
14
|
+
|
15
|
+
res =
|
16
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
17
|
+
http.request(req)
|
18
|
+
end
|
19
|
+
|
20
|
+
if res.is_a?(Net::HTTPSuccess)
|
21
|
+
local_dir = File.dirname(local_path)
|
22
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
23
|
+
content = res.body
|
24
|
+
content = to_utf8(content) if convert_to_utf8
|
25
|
+
File.write(local_path, content)
|
26
|
+
puts 'succeed'
|
27
|
+
return true
|
28
|
+
else
|
29
|
+
puts res
|
30
|
+
end
|
31
|
+
rescue StandardError => e
|
32
|
+
puts e.backtrace
|
33
|
+
puts e
|
34
|
+
false
|
35
|
+
end
|
36
|
+
false
|
37
|
+
end
|
38
|
+
|
39
|
+
def direct_http_post(href, local_path, params,
|
40
|
+
header: nil, convert_to_utf8: false)
|
41
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
42
|
+
|
43
|
+
begin
|
44
|
+
req = Net::HTTP::Post.new(href)
|
45
|
+
req.set_form_data(params)
|
46
|
+
header.each { |k, v| req[k] = v } if header
|
47
|
+
|
48
|
+
res =
|
49
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
50
|
+
http.request(req)
|
51
|
+
end
|
52
|
+
|
53
|
+
if res.is_a?(Net::HTTPSuccess)
|
54
|
+
local_dir = File.dirname(local_path)
|
55
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
56
|
+
content = res.body
|
57
|
+
content = to_utf8(content) if convert_to_utf8
|
58
|
+
File.write(local_path, content)
|
59
|
+
puts 'succeed'
|
60
|
+
return true
|
61
|
+
else
|
62
|
+
puts res
|
63
|
+
end
|
64
|
+
rescue StandardError => e
|
65
|
+
puts e
|
66
|
+
false
|
67
|
+
end
|
68
|
+
false
|
69
|
+
end
|
70
|
+
|
71
|
+
def extract_href_last(origin_href)
|
72
|
+
origin_href.split('/')[-1]
|
73
|
+
end
|
74
|
+
|
75
|
+
def string_to_uri(href)
|
76
|
+
l = href
|
77
|
+
l.sub!('http:///', 'http://')
|
78
|
+
l = Addressable::URI.parse(l)
|
79
|
+
l.normalize!
|
80
|
+
end
|
81
|
+
|
82
|
+
BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
|
83
|
+
'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
|
84
|
+
'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
|
85
|
+
'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
|
86
|
+
'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
|
87
|
+
|
88
|
+
# 此函数有时此判断有误,使用to_utf8函数直接转换
|
89
|
+
def smart_to_utf8(str)
|
90
|
+
return str if str.encoding == Encoding::UTF_8
|
91
|
+
to_utf8(str)
|
92
|
+
end
|
93
|
+
|
94
|
+
def to_utf8(str)
|
95
|
+
# 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
|
96
|
+
str.force_encoding(Encoding::ASCII_8BIT)
|
97
|
+
cd = CharDet.detect(str)
|
98
|
+
if cd['confidence'] > 0.6
|
99
|
+
puts cd['encoding']
|
100
|
+
str.force_encoding(cd['encoding'])
|
101
|
+
# 移除BOM头
|
102
|
+
bom_header = BomHeaderMap[cd['encoding']]
|
103
|
+
str.sub!(bom_header, '') if bom_header
|
104
|
+
end
|
105
|
+
str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
|
106
|
+
|
107
|
+
str
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
data/list_spider.gemspec
CHANGED
@@ -1,31 +1,31 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('lib', __dir__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'list_spider/version'
|
5
|
-
|
6
|
-
Gem::Specification.new do |spec|
|
7
|
-
spec.name = 'list_spider'
|
8
|
-
spec.version = ListSpider::VERSION
|
9
|
-
spec.authors = ['Charles Zhang']
|
10
|
-
spec.email = ['gis05zc@163.com']
|
11
|
-
|
12
|
-
spec.summary = 'List Spider'
|
13
|
-
spec.description = 'A url list spider based on em-http-request.'
|
14
|
-
spec.homepage = 'https://github.com/chinazhangchao/list_spider'
|
15
|
-
spec.license = 'MIT'
|
16
|
-
|
17
|
-
spec.files =
|
18
|
-
`git ls-files -z`.split("\x0").reject do |f|
|
19
|
-
f.match(%r{^(test|spec|features)/})
|
20
|
-
end
|
21
|
-
spec.bindir = 'exe'
|
22
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
-
spec.require_paths = ['lib']
|
24
|
-
|
25
|
-
spec.add_development_dependency 'bundler', '~> 1.16'
|
26
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
27
|
-
|
28
|
-
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
-
spec.add_dependency 'nokogiri', '~> 1.
|
30
|
-
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
|
-
end
|
1
|
+
|
2
|
+
lib = File.expand_path('lib', __dir__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'list_spider/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'list_spider'
|
8
|
+
spec.version = ListSpider::VERSION
|
9
|
+
spec.authors = ['Charles Zhang']
|
10
|
+
spec.email = ['gis05zc@163.com']
|
11
|
+
|
12
|
+
spec.summary = 'List Spider'
|
13
|
+
spec.description = 'A url list spider based on em-http-request.'
|
14
|
+
spec.homepage = 'https://github.com/chinazhangchao/list_spider'
|
15
|
+
spec.license = 'MIT'
|
16
|
+
|
17
|
+
spec.files =
|
18
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
19
|
+
f.match(%r{^(test|spec|features)/})
|
20
|
+
end
|
21
|
+
spec.bindir = 'exe'
|
22
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
+
spec.require_paths = ['lib']
|
24
|
+
|
25
|
+
spec.add_development_dependency 'bundler', '~> 1.16'
|
26
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
27
|
+
|
28
|
+
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
+
spec.add_dependency 'nokogiri', '~> 1.10'
|
30
|
+
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
|
+
end
|
data/spider_example.rb
CHANGED
@@ -1,27 +1,27 @@
|
|
1
|
-
require 'list_spider'
|
2
|
-
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
|
-
|
4
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
|
-
|
6
|
-
def parse_index_item(e)
|
7
|
-
content = File.read(e.local_path)
|
8
|
-
doc = Nokogiri::HTML(content)
|
9
|
-
list_group = doc.css('h2.entry-title')
|
10
|
-
link_list = list_group.css('a')
|
11
|
-
|
12
|
-
link_list.each do |link|
|
13
|
-
href = link['href']
|
14
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
15
|
-
ListSpider.add_task(TaskStruct.new(href, local_path))
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
# get_one is a simple function for one taskstruct situation
|
20
|
-
ListSpider.get_one(
|
21
|
-
TaskStruct.new(
|
22
|
-
'https://coolshell.cn/',
|
23
|
-
DOWNLOAD_DIR + 'index.html',
|
24
|
-
parse_method: method(:parse_index_item)
|
25
|
-
),
|
26
|
-
max: 60
|
27
|
-
)
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
|
+
|
4
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
|
+
|
6
|
+
def parse_index_item(e)
|
7
|
+
content = File.read(e.local_path)
|
8
|
+
doc = Nokogiri::HTML(content)
|
9
|
+
list_group = doc.css('h2.entry-title')
|
10
|
+
link_list = list_group.css('a')
|
11
|
+
|
12
|
+
link_list.each do |link|
|
13
|
+
href = link['href']
|
14
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
15
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
# get_one is a simple function for one taskstruct situation
|
20
|
+
ListSpider.get_one(
|
21
|
+
TaskStruct.new(
|
22
|
+
'https://coolshell.cn/',
|
23
|
+
DOWNLOAD_DIR + 'index.html',
|
24
|
+
parse_method: method(:parse_index_item)
|
25
|
+
),
|
26
|
+
max: 60
|
27
|
+
)
|
data/spider_example_2.rb
CHANGED
@@ -1,29 +1,29 @@
|
|
1
|
-
require 'list_spider'
|
2
|
-
|
3
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
4
|
-
|
5
|
-
@next_list = []
|
6
|
-
|
7
|
-
def parse_index_item(e)
|
8
|
-
content = File.read(e.local_path)
|
9
|
-
doc = Nokogiri::HTML(content)
|
10
|
-
list_group = doc.css('h2.entry-title')
|
11
|
-
link_list = list_group.css('a')
|
12
|
-
|
13
|
-
link_list.each do |link|
|
14
|
-
href = link['href']
|
15
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
16
|
-
# or you can save them to database for later use
|
17
|
-
@next_list << TaskStruct.new(href, local_path)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
task_list = []
|
22
|
-
task_list << TaskStruct.new(
|
23
|
-
'https://coolshell.cn/',
|
24
|
-
DOWNLOAD_DIR + 'index.html',
|
25
|
-
parse_method: method(:parse_index_item)
|
26
|
-
)
|
27
|
-
|
28
|
-
ListSpider.get_list(task_list)
|
29
|
-
ListSpider.get_list(@next_list, max: 60)
|
1
|
+
require 'list_spider'
|
2
|
+
|
3
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
4
|
+
|
5
|
+
@next_list = []
|
6
|
+
|
7
|
+
def parse_index_item(e)
|
8
|
+
content = File.read(e.local_path)
|
9
|
+
doc = Nokogiri::HTML(content)
|
10
|
+
list_group = doc.css('h2.entry-title')
|
11
|
+
link_list = list_group.css('a')
|
12
|
+
|
13
|
+
link_list.each do |link|
|
14
|
+
href = link['href']
|
15
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
16
|
+
# or you can save them to database for later use
|
17
|
+
@next_list << TaskStruct.new(href, local_path)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
task_list = []
|
22
|
+
task_list << TaskStruct.new(
|
23
|
+
'https://coolshell.cn/',
|
24
|
+
DOWNLOAD_DIR + 'index.html',
|
25
|
+
parse_method: method(:parse_index_item)
|
26
|
+
)
|
27
|
+
|
28
|
+
ListSpider.get_list(task_list)
|
29
|
+
ListSpider.get_list(@next_list, max: 60)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.0
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -64,14 +64,14 @@ dependencies:
|
|
64
64
|
requirements:
|
65
65
|
- - "~>"
|
66
66
|
- !ruby/object:Gem::Version
|
67
|
-
version: '1.
|
67
|
+
version: '1.10'
|
68
68
|
type: :runtime
|
69
69
|
prerelease: false
|
70
70
|
version_requirements: !ruby/object:Gem::Requirement
|
71
71
|
requirements:
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: '1.
|
74
|
+
version: '1.10'
|
75
75
|
- !ruby/object:Gem::Dependency
|
76
76
|
name: rchardet
|
77
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,7 +136,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
136
136
|
- !ruby/object:Gem::Version
|
137
137
|
version: '0'
|
138
138
|
requirements: []
|
139
|
-
|
139
|
+
rubyforge_project:
|
140
|
+
rubygems_version: 2.7.6
|
140
141
|
signing_key:
|
141
142
|
specification_version: 4
|
142
143
|
summary: List Spider
|