free_spider 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c449eb2916a732e30c4720816338b5588d237859
4
- data.tar.gz: 291cee72295e6cb010214f3f1ba1c992ed86233d
3
+ metadata.gz: 77aa733a35759e6c95a5b46afe179525b3647527
4
+ data.tar.gz: 23811d1a62d030ff1f43a4954123a3e785827227
5
5
  SHA512:
6
- metadata.gz: c7ada15a4fdb0a3e6ae4e9e76951bb418914cb5fe1145a5f9d87e18f2afb25094c5c07a392eadaafe4988e25cf64838a6bb1613702c5a2bc88ba54a5063f38a5
7
- data.tar.gz: 82f77838e76ac8a0e0eef7511e67b91dbc1508793cbbaab169b7a6b256c28c74f79f9ff7d28c7f00541076276bf02e99197e35d5a7bc545c9e366dc28f672421
6
+ metadata.gz: cbc19ec7685f189514c8ddee406d2ad0ebca43ae8902efe1702481572d5cd324cf7160ef0f49074e79986e5e54dc06f7e72638653aa38eea600c78e236fde708
7
+ data.tar.gz: fc6322bc17701135206e57da6cf1fabaf91c742cb850b17f89b3629da40a0f03a23813d89646adc1883c65cb310ee88193484b2caff901c5ac88dfb08bdd1bbe
@@ -1,21 +1,29 @@
1
1
  # encoding = utf-8
2
+
2
3
  # 生成gem
3
4
  # gem build free_spider.gemspec
4
5
  # 安装gem
5
6
  # gem install free_spider
7
+ # 提交gem
8
+ # gem push free_spider-0.0.1.gem
9
+
6
10
  # 程序入口
7
11
  # require 'free_spider'
8
12
  # spider = FreeSpider::Begin.new
9
13
  # spider.plan do
10
- # site 'http://www.dfrobot.com.cn/'
14
+ # site 'http://oszine.com/'
11
15
  # end
12
16
  # spider.crawl
13
- #调试
17
+
18
+ # 调试
14
19
  # pry -Ilib -rfree_spider
15
20
  # irb -Ilib -rfree_spider
16
21
 
22
+ # coending = utf-8
17
23
  require 'open-uri'
18
24
  require 'nokogiri'
25
+ # require 'active_record'
26
+ # require 'mysql2'
19
27
  # require 'logger'
20
28
 
21
29
  module FreeSpider
@@ -26,7 +34,10 @@ module FreeSpider
26
34
  @todo = []
27
35
  # 已经访问过的链接
28
36
  @visited = []
29
- @titles = []
37
+ # 暂时存放内容
38
+ @news_teaching_content = {}
39
+ # 文章题目(判断是否重复)
40
+ @title_saved = []
30
41
  end
31
42
 
32
43
  # 程序制定函数,用户选择需要抓取的网页内容
@@ -40,7 +51,7 @@ module FreeSpider
40
51
 
41
52
  # 查找网页中的链接
42
53
  def find_link(path)
43
- p "find_link-------------------"
54
+ puts "--------find_link--------"
44
55
  begin
45
56
  crawl if path == nil
46
57
  html = open(path).read
@@ -51,23 +62,52 @@ module FreeSpider
51
62
  # p @visited
52
63
  # p path
53
64
  doc = Nokogiri::HTML(html)
54
- # 抓取主要内容
65
+ # 抓取链接加入爬取队列
55
66
  doc.css("a").map do |href|
56
67
  # 选取内容
57
- title = href.attributes["title"]
58
- title_content = href.attributes["title"].value unless title.nil?
68
+ # title = href.attributes["title"]
69
+ # title_content = href.attributes["title"].value unless title.nil?
59
70
  # 处理链接
60
71
  href = href.attributes["href"].value unless href.attributes["href"].nil?
72
+ # 去除重复链接
61
73
  href = @site + href unless href.include?("#{@site}")
74
+
75
+ # 加入爬取队列
62
76
  @todo << href
63
- @titles << title_content
64
77
  end
78
+
79
+ # 抓取主要内容
80
+ unless doc.at_css(".entry-content").nil?
81
+ entry_title = doc.css(".entry-title").children.to_html
82
+ unless @title_saved.include?(entry_title)
83
+ @title_saved << entry_title
84
+ content = doc.css(".entry-content").children.to_html
85
+ @news_teaching_content = {title: entry_title, content: content}
86
+
87
+ # # 文章题目
88
+ # doc.css(".entry-title").each do |entry_title|
89
+ # title = entry_title.children.to_html unless entry_title.nil?
90
+ # news_teaching_content_tmp = {title: title}
91
+ # end
92
+ # # 放入将存入的内容
93
+ # doc.css(".entry-content").each do |entry_content|
94
+ # content = entry_content.children.to_html unless entry_content.nil?
95
+ # news_teaching_content_tmp.merge!({content: content})
96
+ # end
97
+ # p "--------news_entry--------"
98
+ # p news_teaching_content_tmp
99
+ # @news_teaching_content = news_teaching_content_tmp
100
+ end
101
+ end
102
+
103
+
65
104
  # 去除重复链接
66
- @todo.uniq
67
- # 打印信息, 写入文件
68
- puts "#{@visited}"
69
- p @titles.uniq.compact
70
- write_results_to_file('title_out')
105
+ # @todo.uniq
106
+ # 打印信息, 写入文件or数据库
107
+ # puts "#{@visited}"
108
+ # p @titles.uniq.compact
109
+ write_results_to_database
110
+ # write_results_to_file('title_out')
71
111
  crawl
72
112
  rescue OpenURI::HTTPError
73
113
  puts "404"
@@ -106,7 +146,7 @@ module FreeSpider
106
146
 
107
147
  # 需要爬取的网站首页
108
148
  def site(url)
109
- p "-----------------"
149
+ puts "--------Ready---------"
110
150
  if url.empty?
111
151
  puts "URL is blank"
112
152
  else
@@ -115,10 +155,21 @@ module FreeSpider
115
155
  end
116
156
  end
117
157
 
118
- def post_title
119
- @titles.uniq.compact
158
+ # 写入mysql
159
+ def write_results_to_database
160
+ news_teaching = FreeSpider::Downloader::NewsTeaching.new(@news_teaching_content)
161
+ if news_teaching.save
162
+ puts "--------save success!--------"
163
+ else
164
+ puts "--------save error!--------"
165
+ end
120
166
  end
121
167
 
168
+ # def post_title
169
+ # @titles.uniq.compact
170
+ # end
171
+
172
+ # 写入文件
122
173
  def write_results_to_file(file_name)
123
174
  if File.exist?(file_name) || File.new(file_name, "w")
124
175
  File.open(file_name, "w") do |f|
@@ -1,7 +1,36 @@
1
1
  # 下载器的主要职责是抓取网页并将网页内容返还给蜘蛛(Spiders)
2
2
 
3
+ require 'active_record'
4
+ require 'mysql2'
5
+
3
6
  module FreeSpider
4
7
  module Downloader
8
+ ActiveRecord::Base.logger = Logger.new(STDERR)
9
+
10
+ # 链接数据库
11
+ puts "----database_connection-----"
12
+ ActiveRecord::Base.establish_connection(
13
+ adapter: 'mysql2',
14
+ host: 'localhost',
15
+ database: 'chuangkejiazu',
16
+ username: 'root',
17
+ password: '123'
18
+ )
19
+
20
+ # 创建表结构
21
+ puts "----table_create-----"
22
+ ActiveRecord::Schema.define do
23
+ unless ActiveRecord::Base.connection.tables.include? 'news_teachings'
24
+ create_table :news_teachings do |table|
25
+ table.column :title, :string
26
+ table.column :content, :text
27
+ end
28
+ end
29
+ end
30
+
31
+ class NewsTeaching < ActiveRecord::Base
32
+ validates_presence_of :title, :content
33
+ end
5
34
 
6
35
  end
7
36
  end
@@ -1,3 +1,3 @@
1
1
  module FreeSpider
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: free_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - free
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-08 00:00:00.000000000 Z
11
+ date: 2015-04-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.6.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: mysql2
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.13
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.13
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement