free_spider 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/free_spider/begin.rb +131 -0
- data/lib/free_spider/downloader.rb +7 -0
- data/lib/free_spider/logger.rb +7 -0
- data/lib/free_spider/scheduler.rb +7 -0
- data/lib/free_spider/spiders.rb +25 -0
- data/lib/free_spider/storage.rb +7 -0
- data/lib/free_spider/version.rb +3 -0
- data/lib/free_spider.rb +7 -0
- metadata +94 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: c449eb2916a732e30c4720816338b5588d237859
  data.tar.gz: 291cee72295e6cb010214f3f1ba1c992ed86233d
SHA512:
  metadata.gz: c7ada15a4fdb0a3e6ae4e9e76951bb418914cb5fe1145a5f9d87e18f2afb25094c5c07a392eadaafe4988e25cf64838a6bb1613702c5a2bc88ba54a5063f38a5
  data.tar.gz: 82f77838e76ac8a0e0eef7511e67b91dbc1508793cbbaab169b7a6b256c28c74f79f9ff7d28c7f00541076276bf02e99197e35d5a7bc545c9e366dc28f672421
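For reference, these digests can be reproduced locally. A minimal sketch, assuming the downloaded free_spider-0.0.1.gem (a plain tar archive) has been unpacked so that metadata.gz and data.tar.gz sit in the current directory:

  # Verification sketch; file names assume the .gem archive has already been unpacked.
  require 'digest'

  %w[metadata.gz data.tar.gz].each do |name|
    bytes = File.binread(name)
    puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
    puts "#{name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
  end

The printed values should match the entries in checksums.yaml above.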
data/lib/free_spider/begin.rb
ADDED
@@ -0,0 +1,131 @@
# encoding: utf-8
# Build the gem:
#   gem build free_spider.gemspec
# Install the gem:
#   gem install free_spider
# Entry point:
#   require 'free_spider'
#   spider = FreeSpider::Begin.new
#   spider.plan do
#     site 'http://www.dfrobot.com.cn/'
#   end
#   spider.crawl
# Debugging:
#   pry -Ilib -rfree_spider
#   irb -Ilib -rfree_spider

require 'open-uri'
require 'nokogiri'
# require 'logger'

module FreeSpider
  class Begin

    def initialize
      # Links discovered so far
      @todo = []
      # Links already visited
      @visited = []
      @titles = []
    end

    # Planning method: the caller declares which pages to crawl
    def plan(&block)
      if block_given?
        instance_eval(&block)
      else
        puts "no plan"
      end
    end

    # Find the links on a page
    def find_link(path)
      p "find_link-------------------"
      begin
        return crawl if path.nil?
        html = open(path).read
        # Record the link as visited
        @visited << path
        doc = Nokogiri::HTML(html)
        # Walk every anchor on the page
        doc.css("a").map do |link|
          # Collect the title attribute, when present
          title = link.attributes["title"]
          title_content = title.value unless title.nil?
          # Normalize the link; skip anchors without an href
          href = link.attributes["href"]
          next if href.nil?
          href = href.value
          href = @site + href unless href.include?(@site)
          @todo << href
          @titles << title_content
        end
        # Drop duplicate links
        @todo.uniq!
        # Print progress and write the results to a file
        p @visited
        p @titles.uniq.compact
        write_results_to_file('title_out')
        crawl
      rescue OpenURI::HTTPError
        puts "404"
        crawl
      rescue RuntimeError
        puts "redirection forbidden"
        crawl
      rescue URI::InvalidURIError
        puts "bad URI"
        crawl
      end
    end

    # Main crawl loop
    def crawl
      path = nil
      loop do
        # Take the next discovered link from the queue
        path = @todo.shift
        break if path.nil?
        # If the link has already been visited, pick another one
        break unless @visited.include?(path)
        # TODO: drop external links
        # TODO: drop special links
      end
      if path.nil?
        puts "finished"
        # Output of the crawled content:
        # post_title
        return
      end
      find_link(path)
    end

    # Home page of the site to crawl
    def site(url)
      p "-----------------"
      if url.empty?
        puts "URL is blank"
      else
        @site = url
        @todo << @site
      end
    end

    def post_title
      @titles.uniq.compact
    end

    def write_results_to_file(file_name)
      # File.open with "w" creates the file if it does not exist
      File.open(file_name, "w") do |f|
        f.write(@titles.uniq.compact)
      end
    end

  end
end
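Put together, the header comments and the public methods give the following usage pattern. A sketch only: the seed URL is the example from the header comment, and the 'title_out' output file name is hard-coded in find_link.

  require 'free_spider'

  spider = FreeSpider::Begin.new
  spider.plan do
    # Seed the crawl with the site's home page
    site 'http://www.dfrobot.com.cn/'
  end
  spider.crawl
  # Once the queue is exhausted, the collected titles are also written to 'title_out'
  p spider.post_title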
data/lib/free_spider/spiders.rb
ADDED
@@ -0,0 +1,25 @@
# A spider is a class the user defines (in the spirit of Scrapy) to parse pages and
# extract the content returned from the specified URLs; each spider can handle a
# single domain or a group of domains.
# It defines the crawling and parsing rules for a particular site.

require 'open-uri'
require 'nokogiri'

module FreeSpider
  module Spiders
    class Parser

      def initialize(url, css, attributes)
        @url = url
        @css = css
        @attributes = attributes
      end

      def fetcher
        html = open(@url).read
        doc = Nokogiri::HTML(html)
        # Collect the requested attribute from every node matching the CSS selector
        doc.css(@css).map do |node|
          node.attributes[@attributes].value
        end
      end
    end

  end
end
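A hypothetical call to Parser, mirroring the anchor scraping in Begin#find_link. The URL, selector, and attribute below are illustrative, and the example assumes lib/free_spider.rb loads the spiders file when required:

  require 'free_spider'

  # Collect the href of every <a> element on the page
  parser = FreeSpider::Spiders::Parser.new('http://www.dfrobot.com.cn/', 'a', 'href')
  p parser.fetcher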
data/lib/free_spider.rb
ADDED
metadata
ADDED
@@ -0,0 +1,94 @@
--- !ruby/object:Gem::Specification
name: free_spider
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- free
autorequire:
bindir: bin
cert_chain: []
date: 2015-04-08 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.6.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.6.0
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.3'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 10.1.0
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 10.1.0
description: A simple spider
email:
- 747549945@qq.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/free_spider.rb
- lib/free_spider/begin.rb
- lib/free_spider/downloader.rb
- lib/free_spider/logger.rb
- lib/free_spider/scheduler.rb
- lib/free_spider/spiders.rb
- lib/free_spider/storage.rb
- lib/free_spider/version.rb
homepage: https://github.com/free1/free_spider
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: A simple spider
test_files: []
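The metadata above is the serialized Gem::Specification. The free_spider.gemspec referenced in begin.rb's build comment is not part of the packaged files, but a gemspec reconstructed from this metadata would look roughly like the sketch below; the files glob is an assumption.

  # Reconstructed sketch based on the metadata above, not the published gemspec file.
  Gem::Specification.new do |spec|
    spec.name          = 'free_spider'
    spec.version       = '0.0.1'
    spec.authors       = ['free']
    spec.email         = ['747549945@qq.com']
    spec.summary       = 'A simple spider'
    spec.description   = 'A simple spider'
    spec.homepage      = 'https://github.com/free1/free_spider'
    spec.license       = 'MIT'
    spec.files         = Dir['lib/**/*.rb']
    spec.require_paths = ['lib']

    spec.add_runtime_dependency 'nokogiri', '>= 1.6.0'
    spec.add_development_dependency 'bundler', '~> 1.3'
    spec.add_development_dependency 'rake', '>= 10.1.0'
  end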