free_spider 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/free_spider/begin.rb +67 -16
- data/lib/free_spider/downloader.rb +29 -0
- data/lib/free_spider/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 77aa733a35759e6c95a5b46afe179525b3647527
|
4
|
+
data.tar.gz: 23811d1a62d030ff1f43a4954123a3e785827227
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cbc19ec7685f189514c8ddee406d2ad0ebca43ae8902efe1702481572d5cd324cf7160ef0f49074e79986e5e54dc06f7e72638653aa38eea600c78e236fde708
|
7
|
+
data.tar.gz: fc6322bc17701135206e57da6cf1fabaf91c742cb850b17f89b3629da40a0f03a23813d89646adc1883c65cb310ee88193484b2caff901c5ac88dfb08bdd1bbe
|
data/lib/free_spider/begin.rb
CHANGED
@@ -1,21 +1,29 @@
|
|
1
1
|
# encoding = utf-8
|
2
|
+
|
2
3
|
# 生成gem
|
3
4
|
# gem build free_spider.gemspec
|
4
5
|
# 安装gem
|
5
6
|
# gem install free_spider
|
7
|
+
# 提交gem
|
8
|
+
# gem push free_spider-0.0.1.gem
|
9
|
+
|
6
10
|
# 程序入口
|
7
11
|
# require 'free_spider'
|
8
12
|
# spider = FreeSpider::Begin.new
|
9
13
|
# spider.plan do
|
10
|
-
# site 'http://
|
14
|
+
# site 'http://oszine.com/'
|
11
15
|
# end
|
12
16
|
# spider.crawl
|
13
|
-
|
17
|
+
|
18
|
+
# 调试
|
14
19
|
# pry -Ilib -rfree_spider
|
15
20
|
# irb -Ilib -rfree_spider
|
16
21
|
|
22
|
+
# coending = utf-8
|
17
23
|
require 'open-uri'
|
18
24
|
require 'nokogiri'
|
25
|
+
# require 'active_record'
|
26
|
+
# require 'mysql2'
|
19
27
|
# require 'logger'
|
20
28
|
|
21
29
|
module FreeSpider
|
@@ -26,7 +34,10 @@ module FreeSpider
|
|
26
34
|
@todo = []
|
27
35
|
# 已经访问过的链接
|
28
36
|
@visited = []
|
29
|
-
|
37
|
+
# 暂时存放内容
|
38
|
+
@news_teaching_content = {}
|
39
|
+
# 文章题目(判断是否重复)
|
40
|
+
@title_saved = []
|
30
41
|
end
|
31
42
|
|
32
43
|
# 程序制定函数,用户选择需要抓取的网页内容
|
@@ -40,7 +51,7 @@ module FreeSpider
|
|
40
51
|
|
41
52
|
# 查找网页中的链接
|
42
53
|
def find_link(path)
|
43
|
-
|
54
|
+
puts "--------find_link--------"
|
44
55
|
begin
|
45
56
|
crawl if path == nil
|
46
57
|
html = open(path).read
|
@@ -51,23 +62,52 @@ module FreeSpider
|
|
51
62
|
# p @visited
|
52
63
|
# p path
|
53
64
|
doc = Nokogiri::HTML(html)
|
54
|
-
#
|
65
|
+
# 抓取链接加入爬取队列
|
55
66
|
doc.css("a").map do |href|
|
56
67
|
# 选取内容
|
57
|
-
title = href.attributes["title"]
|
58
|
-
title_content = href.attributes["title"].value unless title.nil?
|
68
|
+
# title = href.attributes["title"]
|
69
|
+
# title_content = href.attributes["title"].value unless title.nil?
|
59
70
|
# 处理链接
|
60
71
|
href = href.attributes["href"].value unless href.attributes["href"].nil?
|
72
|
+
# 去除重复链接
|
61
73
|
href = @site + href unless href.include?("#{@site}")
|
74
|
+
|
75
|
+
# 加入爬取队列
|
62
76
|
@todo << href
|
63
|
-
@titles << title_content
|
64
77
|
end
|
78
|
+
|
79
|
+
# 抓取主要内容
|
80
|
+
unless doc.at_css(".entry-content").nil?
|
81
|
+
entry_title = doc.css(".entry-title").children.to_html
|
82
|
+
unless @title_saved.include?(entry_title)
|
83
|
+
@title_saved << entry_title
|
84
|
+
content = doc.css(".entry-content").children.to_html
|
85
|
+
@news_teaching_content = {title: entry_title, content: content}
|
86
|
+
|
87
|
+
# # 文章题目
|
88
|
+
# doc.css(".entry-title").each do |entry_title|
|
89
|
+
# title = entry_title.children.to_html unless entry_title.nil?
|
90
|
+
# news_teaching_content_tmp = {title: title}
|
91
|
+
# end
|
92
|
+
# # 放入将存入的内容
|
93
|
+
# doc.css(".entry-content").each do |entry_content|
|
94
|
+
# content = entry_content.children.to_html unless entry_content.nil?
|
95
|
+
# news_teaching_content_tmp.merge!({content: content})
|
96
|
+
# end
|
97
|
+
# p "--------news_entry--------"
|
98
|
+
# p news_teaching_content_tmp
|
99
|
+
# @news_teaching_content = news_teaching_content_tmp
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
|
65
104
|
# 去除重复链接
|
66
|
-
@todo.uniq
|
67
|
-
# 打印信息, 写入文件
|
68
|
-
puts "#{@visited}"
|
69
|
-
p @titles.uniq.compact
|
70
|
-
|
105
|
+
# @todo.uniq
|
106
|
+
# 打印信息, 写入文件or数据库
|
107
|
+
# puts "#{@visited}"
|
108
|
+
# p @titles.uniq.compact
|
109
|
+
write_results_to_database
|
110
|
+
# write_results_to_file('title_out')
|
71
111
|
crawl
|
72
112
|
rescue OpenURI::HTTPError
|
73
113
|
puts "404"
|
@@ -106,7 +146,7 @@ module FreeSpider
|
|
106
146
|
|
107
147
|
# 需要爬取的网站首页
|
108
148
|
def site(url)
|
109
|
-
|
149
|
+
puts "--------Ready---------"
|
110
150
|
if url.empty?
|
111
151
|
puts "URL is blank"
|
112
152
|
else
|
@@ -115,10 +155,21 @@ module FreeSpider
|
|
115
155
|
end
|
116
156
|
end
|
117
157
|
|
118
|
-
|
119
|
-
|
158
|
+
# 写入mysql
|
159
|
+
def write_results_to_database
|
160
|
+
news_teaching = FreeSpider::Downloader::NewsTeaching.new(@news_teaching_content)
|
161
|
+
if news_teaching.save
|
162
|
+
puts "--------save success!--------"
|
163
|
+
else
|
164
|
+
puts "--------save error!--------"
|
165
|
+
end
|
120
166
|
end
|
121
167
|
|
168
|
+
# def post_title
|
169
|
+
# @titles.uniq.compact
|
170
|
+
# end
|
171
|
+
|
172
|
+
# 写入文件
|
122
173
|
def write_results_to_file(file_name)
|
123
174
|
if File.exist?(file_name) || File.new(file_name, "w")
|
124
175
|
File.open(file_name, "w") do |f|
|
data/lib/free_spider/downloader.rb
CHANGED
@@ -1,7 +1,36 @@
|
|
1
1
|
# 下载器的主要职责是抓取网页并将网页内容返还给蜘蛛(Spiders)
|
2
2
|
|
3
|
+
require 'active_record'
|
4
|
+
require 'mysql2'
|
5
|
+
|
3
6
|
module FreeSpider
|
4
7
|
module Downloader
|
8
|
+
ActiveRecord::Base.logger = Logger.new(STDERR)
|
9
|
+
|
10
|
+
# 链接数据库
|
11
|
+
puts "----database_connection-----"
|
12
|
+
ActiveRecord::Base.establish_connection(
|
13
|
+
adapter: 'mysql2',
|
14
|
+
host: 'localhost',
|
15
|
+
database: 'chuangkejiazu',
|
16
|
+
username: 'root',
|
17
|
+
password: '123'
|
18
|
+
)
|
19
|
+
|
20
|
+
# 创建表结构
|
21
|
+
puts "----table_create-----"
|
22
|
+
ActiveRecord::Schema.define do
|
23
|
+
unless ActiveRecord::Base.connection.tables.include? 'news_teachings'
|
24
|
+
create_table :news_teachings do |table|
|
25
|
+
table.column :title, :string
|
26
|
+
table.column :content, :text
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class NewsTeaching < ActiveRecord::Base
|
32
|
+
validates_presence_of :title, :content
|
33
|
+
end
|
5
34
|
|
6
35
|
end
|
7
36
|
end
|
data/lib/free_spider/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: free_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- free
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mysql2
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.3.13
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.3.13
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|