free_spider 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/free_spider/begin.rb +67 -16
- data/lib/free_spider/downloader.rb +29 -0
- data/lib/free_spider/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 77aa733a35759e6c95a5b46afe179525b3647527
|
4
|
+
data.tar.gz: 23811d1a62d030ff1f43a4954123a3e785827227
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cbc19ec7685f189514c8ddee406d2ad0ebca43ae8902efe1702481572d5cd324cf7160ef0f49074e79986e5e54dc06f7e72638653aa38eea600c78e236fde708
|
7
|
+
data.tar.gz: fc6322bc17701135206e57da6cf1fabaf91c742cb850b17f89b3629da40a0f03a23813d89646adc1883c65cb310ee88193484b2caff901c5ac88dfb08bdd1bbe
|
data/lib/free_spider/begin.rb
CHANGED
@@ -1,21 +1,29 @@
|
|
1
1
|
# encoding = utf-8
|
2
|
+
|
2
3
|
# 生成gem
|
3
4
|
# gem build free_spider.gemspec
|
4
5
|
# 安装gem
|
5
6
|
# gem install free_spider
|
7
|
+
# 提交gem
|
8
|
+
# gem push free_spider-0.0.1.gem
|
9
|
+
|
6
10
|
# 程序入口
|
7
11
|
# require 'free_spider'
|
8
12
|
# spider = FreeSpider::Begin.new
|
9
13
|
# spider.plan do
|
10
|
-
# site 'http://
|
14
|
+
# site 'http://oszine.com/'
|
11
15
|
# end
|
12
16
|
# spider.crawl
|
13
|
-
|
17
|
+
|
18
|
+
# 调试
|
14
19
|
# pry -Ilib -rfree_spider
|
15
20
|
# irb -Ilib -rfree_spider
|
16
21
|
|
22
|
+
# coending = utf-8
|
17
23
|
require 'open-uri'
|
18
24
|
require 'nokogiri'
|
25
|
+
# require 'active_record'
|
26
|
+
# require 'mysql2'
|
19
27
|
# require 'logger'
|
20
28
|
|
21
29
|
module FreeSpider
|
@@ -26,7 +34,10 @@ module FreeSpider
|
|
26
34
|
@todo = []
|
27
35
|
# 已经访问过的链接
|
28
36
|
@visited = []
|
29
|
-
|
37
|
+
# 暂时存放内容
|
38
|
+
@news_teaching_content = {}
|
39
|
+
# 文章题目(判断是否重复)
|
40
|
+
@title_saved = []
|
30
41
|
end
|
31
42
|
|
32
43
|
# 程序制定函数,用户选择需要抓取的网页内容
|
@@ -40,7 +51,7 @@ module FreeSpider
|
|
40
51
|
|
41
52
|
# 查找网页中的链接
|
42
53
|
def find_link(path)
|
43
|
-
|
54
|
+
puts "--------find_link--------"
|
44
55
|
begin
|
45
56
|
crawl if path == nil
|
46
57
|
html = open(path).read
|
@@ -51,23 +62,52 @@ module FreeSpider
|
|
51
62
|
# p @visited
|
52
63
|
# p path
|
53
64
|
doc = Nokogiri::HTML(html)
|
54
|
-
#
|
65
|
+
# 抓取链接加入爬取队列
|
55
66
|
doc.css("a").map do |href|
|
56
67
|
# 选取内容
|
57
|
-
title = href.attributes["title"]
|
58
|
-
title_content = href.attributes["title"].value unless title.nil?
|
68
|
+
# title = href.attributes["title"]
|
69
|
+
# title_content = href.attributes["title"].value unless title.nil?
|
59
70
|
# 处理链接
|
60
71
|
href = href.attributes["href"].value unless href.attributes["href"].nil?
|
72
|
+
# 去除重复链接
|
61
73
|
href = @site + href unless href.include?("#{@site}")
|
74
|
+
|
75
|
+
# 加入爬取队列
|
62
76
|
@todo << href
|
63
|
-
@titles << title_content
|
64
77
|
end
|
78
|
+
|
79
|
+
# 抓取主要内容
|
80
|
+
unless doc.at_css(".entry-content").nil?
|
81
|
+
entry_title = doc.css(".entry-title").children.to_html
|
82
|
+
unless @title_saved.include?(entry_title)
|
83
|
+
@title_saved << entry_title
|
84
|
+
content = doc.css(".entry-content").children.to_html
|
85
|
+
@news_teaching_content = {title: entry_title, content: content}
|
86
|
+
|
87
|
+
# # 文章题目
|
88
|
+
# doc.css(".entry-title").each do |entry_title|
|
89
|
+
# title = entry_title.children.to_html unless entry_title.nil?
|
90
|
+
# news_teaching_content_tmp = {title: title}
|
91
|
+
# end
|
92
|
+
# # 放入将存入的内容
|
93
|
+
# doc.css(".entry-content").each do |entry_content|
|
94
|
+
# content = entry_content.children.to_html unless entry_content.nil?
|
95
|
+
# news_teaching_content_tmp.merge!({content: content})
|
96
|
+
# end
|
97
|
+
# p "--------news_entry--------"
|
98
|
+
# p news_teaching_content_tmp
|
99
|
+
# @news_teaching_content = news_teaching_content_tmp
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
|
65
104
|
# 去除重复链接
|
66
|
-
@todo.uniq
|
67
|
-
# 打印信息, 写入文件
|
68
|
-
puts "#{@visited}"
|
69
|
-
p @titles.uniq.compact
|
70
|
-
|
105
|
+
# @todo.uniq
|
106
|
+
# 打印信息, 写入文件or数据库
|
107
|
+
# puts "#{@visited}"
|
108
|
+
# p @titles.uniq.compact
|
109
|
+
write_results_to_database
|
110
|
+
# write_results_to_file('title_out')
|
71
111
|
crawl
|
72
112
|
rescue OpenURI::HTTPError
|
73
113
|
puts "404"
|
@@ -106,7 +146,7 @@ module FreeSpider
|
|
106
146
|
|
107
147
|
# 需要爬取的网站首页
|
108
148
|
def site(url)
|
109
|
-
|
149
|
+
puts "--------Ready---------"
|
110
150
|
if url.empty?
|
111
151
|
puts "URL is blank"
|
112
152
|
else
|
@@ -115,10 +155,21 @@ module FreeSpider
|
|
115
155
|
end
|
116
156
|
end
|
117
157
|
|
118
|
-
|
119
|
-
|
158
|
+
# 写入mysql
|
159
|
+
def write_results_to_database
|
160
|
+
news_teaching = FreeSpider::Downloader::NewsTeaching.new(@news_teaching_content)
|
161
|
+
if news_teaching.save
|
162
|
+
puts "--------save success!--------"
|
163
|
+
else
|
164
|
+
puts "--------save error!--------"
|
165
|
+
end
|
120
166
|
end
|
121
167
|
|
168
|
+
# def post_title
|
169
|
+
# @titles.uniq.compact
|
170
|
+
# end
|
171
|
+
|
172
|
+
# 写入文件
|
122
173
|
def write_results_to_file(file_name)
|
123
174
|
if File.exist?(file_name) || File.new(file_name, "w")
|
124
175
|
File.open(file_name, "w") do |f|
|
@@ -1,7 +1,36 @@
|
|
1
1
|
# 下载器的主要职责是抓取网页并将网页内容返还给蜘蛛(Spiders)
|
2
2
|
|
3
|
+
require 'active_record'
|
4
|
+
require 'mysql2'
|
5
|
+
|
3
6
|
module FreeSpider
|
4
7
|
module Downloader
|
8
|
+
ActiveRecord::Base.logger = Logger.new(STDERR)
|
9
|
+
|
10
|
+
# 链接数据库
|
11
|
+
puts "----database_connection-----"
|
12
|
+
ActiveRecord::Base.establish_connection(
|
13
|
+
adapter: 'mysql2',
|
14
|
+
host: 'localhost',
|
15
|
+
database: 'chuangkejiazu',
|
16
|
+
username: 'root',
|
17
|
+
password: '123'
|
18
|
+
)
|
19
|
+
|
20
|
+
# 创建表结构
|
21
|
+
puts "----table_create-----"
|
22
|
+
ActiveRecord::Schema.define do
|
23
|
+
unless ActiveRecord::Base.connection.tables.include? 'news_teachings'
|
24
|
+
create_table :news_teachings do |table|
|
25
|
+
table.column :title, :string
|
26
|
+
table.column :content, :text
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class NewsTeaching < ActiveRecord::Base
|
32
|
+
validates_presence_of :title, :content
|
33
|
+
end
|
5
34
|
|
6
35
|
end
|
7
36
|
end
|
data/lib/free_spider/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: free_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- free
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mysql2
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.3.13
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.3.13
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|