free_spider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: c449eb2916a732e30c4720816338b5588d237859
+   data.tar.gz: 291cee72295e6cb010214f3f1ba1c992ed86233d
+ SHA512:
+   metadata.gz: c7ada15a4fdb0a3e6ae4e9e76951bb418914cb5fe1145a5f9d87e18f2afb25094c5c07a392eadaafe4988e25cf64838a6bb1613702c5a2bc88ba54a5063f38a5
+   data.tar.gz: 82f77838e76ac8a0e0eef7511e67b91dbc1508793cbbaab169b7a6b256c28c74f79f9ff7d28c7f00541076276bf02e99197e35d5a7bc545c9e366dc28f672421
lib/free_spider/begin.rb ADDED
@@ -0,0 +1,131 @@
+ # encoding: utf-8
+ # Build the gem:
+ #   gem build free_spider.gemspec
+ # Install the gem:
+ #   gem install free_spider
+ # Entry point:
+ #   require 'free_spider'
+ #   spider = FreeSpider::Begin.new
+ #   spider.plan do
+ #     site 'http://www.dfrobot.com.cn/'
+ #   end
+ #   spider.crawl
+ # Debugging:
+ #   pry -Ilib -rfree_spider
+ #   irb -Ilib -rfree_spider
+
+ require 'open-uri'
+ require 'nokogiri'
+ # require 'logger'
+
+ module FreeSpider
+   class Begin
+
+     def initialize
+       # Links found so far, waiting to be crawled
+       @todo = []
+       # Links that have already been visited
+       @visited = []
+       @titles = []
+     end
+
+     # Planning method: the user declares what to crawl inside the block
+     def plan(&block)
+       if block_given?
+         instance_eval(&block)
+       else
+         puts "no plan"
+       end
+     end
+
+     # Find the links on a page
+     def find_link(path)
+       p "find_link-------------------"
+       begin
+         return crawl if path.nil?
+         html = open(path).read
+         # html = open('http://www.dfrobot.com.cn/').read
+         # Record the link as visited
+         @visited << path
+         # p "================"
+         # p @visited
+         # p path
+         doc = Nokogiri::HTML(html)
+         # Extract the anchors on the page
+         doc.css("a").map do |link|
+           # Pick out the title attribute, if present
+           title = link.attributes["title"]
+           title_content = title.value unless title.nil?
+           # Skip anchors that have no href
+           next if link.attributes["href"].nil?
+           href = link.attributes["href"].value
+           # Turn relative links into absolute ones
+           href = @site + href unless href.include?(@site)
+           @todo << href
+           @titles << title_content
+         end
+         # Remove duplicate links
+         @todo.uniq!
+         # Print progress and write the collected titles to a file
+         p @visited
+         p @titles.uniq.compact
+         write_results_to_file('title_out')
+         crawl
+       rescue OpenURI::HTTPError
+         puts "404"
+         crawl
+       rescue RuntimeError
+         puts "redirection forbidden"
+         crawl
+       rescue URI::InvalidURIError
+         puts "bad URI"
+         crawl
+       end
+     end
+
+     # Drives the crawl: keeps picking unvisited links until the queue is empty
+     def crawl
+       path = nil
+       loop do
+         # Take one of the discovered links
+         path = @todo.shift
+         break if path.nil?
+         # If the link has already been visited, pick another one
+         break unless @visited.include?(path)
+         # TODO: drop external links
+         # TODO: drop special links
+       end
+       if path.nil?
+         puts "finished"
+         # Output the crawled titles
+         # post_title
+         return
+       end
+       find_link(path)
+     end
+
+     # The home page of the site to crawl
+     def site(url)
+       p "-----------------"
+       if url.empty?
+         puts "URL is blank"
+       else
+         @site = url
+         @todo << @site
+       end
+     end
+
+     def post_title
+       @titles.uniq.compact
+     end
+
+     def write_results_to_file(file_name)
+       # "w" mode creates the file if it does not already exist
+       File.open(file_name, "w") do |f|
+         f.write(@titles.uniq.compact)
+       end
+     end
+
+   end
+ end
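
The header comments in begin.rb sketch the intended workflow. Putting them together, a minimal usage example of the FreeSpider::Begin API (the output file name 'title_out' is hard-coded in find_link):

  require 'free_spider'

  spider = FreeSpider::Begin.new
  # `site` is evaluated inside the plan block via instance_eval
  spider.plan do
    site 'http://www.dfrobot.com.cn/'
  end
  # Walk the discovered links; link titles are printed and written to 'title_out'
  spider.crawl
  # The de-duplicated titles are also available afterwards
  p spider.post_title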
lib/free_spider/downloader.rb ADDED
@@ -0,0 +1,7 @@
+ # The downloader's main responsibility is to fetch web pages and hand their content back to the spiders (Spiders)
+
+ module FreeSpider
+   module Downloader
+
+   end
+ end
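
FreeSpider::Downloader ships as an empty placeholder in 0.0.1; page fetching currently happens directly in Begin#find_link and Spiders::Parser#fetcher. A hypothetical sketch of how that logic could be extracted into the downloader (the fetch method below is an assumption, not part of the released gem):

  require 'open-uri'

  module FreeSpider
    module Downloader
      # Hypothetical helper: fetch a page and return its raw HTML,
      # so spiders no longer need to call open-uri themselves.
      def self.fetch(url)
        open(url).read
      rescue OpenURI::HTTPError, URI::InvalidURIError => e
        puts "download failed: #{e.message}"
        nil
      end
    end
  end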
lib/free_spider/logger.rb ADDED
@@ -0,0 +1,7 @@
+ # Logs errors that occur while crawling
+
+ module FreeSpider
+   module Logger
+
+   end
+ end
lib/free_spider/scheduler.rb ADDED
@@ -0,0 +1,7 @@
+ # The scheduler accepts requests from the Scrapy engine, queues them in order, and hands them back when the engine asks for them
+
+ module FreeSpider
+   module Scheduler
+
+   end
+ end
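
FreeSpider::Scheduler is likewise empty in 0.0.1; the queueing is done inside Begin with the @todo and @visited arrays. A hypothetical sketch of a standalone FIFO scheduler with the same skip-already-seen behaviour (the class and method names here are assumptions, not part of the gem):

  module FreeSpider
    module Scheduler
      # Hypothetical FIFO queue that ignores URLs it has already accepted,
      # mirroring what Begin does with @todo and @visited.
      class Queue
        def initialize
          @todo = []
          @seen = {}
        end

        # Enqueue a URL unless it was queued before
        def push(url)
          return if @seen[url]
          @seen[url] = true
          @todo << url
        end

        # Hand back the next URL, or nil when the queue is empty
        def shift
          @todo.shift
        end
      end
    end
  end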
lib/free_spider/spiders.rb ADDED
@@ -0,0 +1,25 @@
+ # Spiders are classes defined by the Scrapy user to parse pages and extract the content returned from the specified URLs; each spider can handle a single domain or a group of domains.
+ # They define the crawling and parsing rules for a particular site.
+
+ require 'open-uri'
+ require 'nokogiri'
+
+ module FreeSpider
+   module Spiders
+     class Parser
+
+       def initialize(url, css, attributes)
+         @url = url
+         @css = css
+         @attributes = attributes
+       end
+
+       # Fetch the page and return the chosen attribute of every matching node
+       def fetcher
+         html = open(@url).read
+         doc = Nokogiri::HTML(html)
+         # Find the required content in the page
+         doc.css(@css).map do |node|
+           node.attributes[@attributes].value
+         end
+       end
+     end
+
+   end
+ end
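
A short usage sketch of the Parser class above; the URL, CSS selector, and attribute name are illustrative values, not defaults shipped with the gem:

  require 'free_spider'

  # Collect the href of every anchor that actually has one
  parser = FreeSpider::Spiders::Parser.new('http://www.dfrobot.com.cn/', 'a[href]', 'href')
  p parser.fetcher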
lib/free_spider/storage.rb ADDED
@@ -0,0 +1,7 @@
+ # Stores the data collected by the crawl
+
+ module FreeSpider
+   module Storage
+
+   end
+ end
lib/free_spider/version.rb ADDED
@@ -0,0 +1,3 @@
+ module FreeSpider
+   VERSION = "0.0.1"
+ end
lib/free_spider.rb ADDED
@@ -0,0 +1,7 @@
+ require "free_spider/version"
+ require "free_spider/downloader"
+ require "free_spider/spiders"
+ require "free_spider/logger"
+ require "free_spider/scheduler"
+ require "free_spider/storage"
+ require "free_spider/begin"
metadata ADDED
@@ -0,0 +1,94 @@
+ --- !ruby/object:Gem::Specification
+ name: free_spider
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - free
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-04-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.0
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 10.1.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 10.1.0
+ description: A simple spider
+ email:
+ - 747549945@qq.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/free_spider.rb
+ - lib/free_spider/begin.rb
+ - lib/free_spider/downloader.rb
+ - lib/free_spider/logger.rb
+ - lib/free_spider/scheduler.rb
+ - lib/free_spider/spiders.rb
+ - lib/free_spider/storage.rb
+ - lib/free_spider/version.rb
+ homepage: https://github.com/free1/free_spider
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5
+ signing_key:
+ specification_version: 4
+ summary: A simple spider
+ test_files: []