free_spider 0.0.1 → 0.0.2

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c449eb2916a732e30c4720816338b5588d237859
4
- data.tar.gz: 291cee72295e6cb010214f3f1ba1c992ed86233d
3
+ metadata.gz: 77aa733a35759e6c95a5b46afe179525b3647527
4
+ data.tar.gz: 23811d1a62d030ff1f43a4954123a3e785827227
5
5
  SHA512:
6
- metadata.gz: c7ada15a4fdb0a3e6ae4e9e76951bb418914cb5fe1145a5f9d87e18f2afb25094c5c07a392eadaafe4988e25cf64838a6bb1613702c5a2bc88ba54a5063f38a5
7
- data.tar.gz: 82f77838e76ac8a0e0eef7511e67b91dbc1508793cbbaab169b7a6b256c28c74f79f9ff7d28c7f00541076276bf02e99197e35d5a7bc545c9e366dc28f672421
6
+ metadata.gz: cbc19ec7685f189514c8ddee406d2ad0ebca43ae8902efe1702481572d5cd324cf7160ef0f49074e79986e5e54dc06f7e72638653aa38eea600c78e236fde708
7
+ data.tar.gz: fc6322bc17701135206e57da6cf1fabaf91c742cb850b17f89b3629da40a0f03a23813d89646adc1883c65cb310ee88193484b2caff901c5ac88dfb08bdd1bbe
@@ -1,21 +1,29 @@
1
1
  # encoding = utf-8
2
+
2
3
  # 生成gem
3
4
  # gem build free_spider.gemspec
4
5
  # 安装gem
5
6
  # gem install free_spider
7
+ # 提交gem
8
+ # gem push free_spider-0.0.1.gem
9
+
6
10
  # 程序入口
7
11
  # require 'free_spider'
8
12
  # spider = FreeSpider::Begin.new
9
13
  # spider.plan do
10
- # site 'http://www.dfrobot.com.cn/'
14
+ # site 'http://oszine.com/'
11
15
  # end
12
16
  # spider.crawl
13
- #调试
17
+
18
+ # 调试
14
19
  # pry -Ilib -rfree_spider
15
20
  # irb -Ilib -rfree_spider
16
21
 
22
+ # encoding = utf-8
17
23
  require 'open-uri'
18
24
  require 'nokogiri'
25
+ # require 'active_record'
26
+ # require 'mysql2'
19
27
  # require 'logger'
20
28
 
21
29
  module FreeSpider
@@ -26,7 +34,10 @@ module FreeSpider
26
34
  @todo = []
27
35
  # 已经访问过的链接
28
36
  @visited = []
29
- @titles = []
37
+ # 暂时存放内容
38
+ @news_teaching_content = {}
39
+ # 文章题目(判断是否重复)
40
+ @title_saved = []
30
41
  end
31
42
 
32
43
  # 程序制定函数,用户选择需要抓取的网页内容
@@ -40,7 +51,7 @@ module FreeSpider
40
51
 
41
52
  # 查找网页中的链接
42
53
  def find_link(path)
43
- p "find_link-------------------"
54
+ puts "--------find_link--------"
44
55
  begin
45
56
  crawl if path == nil
46
57
  html = open(path).read
@@ -51,23 +62,52 @@ module FreeSpider
51
62
  # p @visited
52
63
  # p path
53
64
  doc = Nokogiri::HTML(html)
54
- # 抓取主要内容
65
+ # 抓取链接加入爬取队列
55
66
  doc.css("a").map do |href|
56
67
  # 选取内容
57
- title = href.attributes["title"]
58
- title_content = href.attributes["title"].value unless title.nil?
68
+ # title = href.attributes["title"]
69
+ # title_content = href.attributes["title"].value unless title.nil?
59
70
  # 处理链接
60
71
  href = href.attributes["href"].value unless href.attributes["href"].nil?
72
+ # 去除重复链接
61
73
  href = @site + href unless href.include?("#{@site}")
74
+
75
+ # 加入爬取队列
62
76
  @todo << href
63
- @titles << title_content
64
77
  end
78
+
79
+ # 抓取主要内容
80
+ unless doc.at_css(".entry-content").nil?
81
+ entry_title = doc.css(".entry-title").children.to_html
82
+ unless @title_saved.include?(entry_title)
83
+ @title_saved << entry_title
84
+ content = doc.css(".entry-content").children.to_html
85
+ @news_teaching_content = {title: entry_title, content: content}
86
+
87
+ # # 文章题目
88
+ # doc.css(".entry-title").each do |entry_title|
89
+ # title = entry_title.children.to_html unless entry_title.nil?
90
+ # news_teaching_content_tmp = {title: title}
91
+ # end
92
+ # # 放入将存入的内容
93
+ # doc.css(".entry-content").each do |entry_content|
94
+ # content = entry_content.children.to_html unless entry_content.nil?
95
+ # news_teaching_content_tmp.merge!({content: content})
96
+ # end
97
+ # p "--------news_entry--------"
98
+ # p news_teaching_content_tmp
99
+ # @news_teaching_content = news_teaching_content_tmp
100
+ end
101
+ end
102
+
103
+
65
104
  # 去除重复链接
66
- @todo.uniq
67
- # 打印信息, 写入文件
68
- puts "#{@visited}"
69
- p @titles.uniq.compact
70
- write_results_to_file('title_out')
105
+ # @todo.uniq
106
+ # 打印信息, 写入文件or数据库
107
+ # puts "#{@visited}"
108
+ # p @titles.uniq.compact
109
+ write_results_to_database
110
+ # write_results_to_file('title_out')
71
111
  crawl
72
112
  rescue OpenURI::HTTPError
73
113
  puts "404"
@@ -106,7 +146,7 @@ module FreeSpider
106
146
 
107
147
  # 需要爬取的网站首页
108
148
  def site(url)
109
- p "-----------------"
149
+ puts "--------Ready---------"
110
150
  if url.empty?
111
151
  puts "URL is blank"
112
152
  else
@@ -115,10 +155,21 @@ module FreeSpider
115
155
  end
116
156
  end
117
157
 
118
- def post_title
119
- @titles.uniq.compact
158
+ # 写入mysql
159
+ def write_results_to_database
160
+ news_teaching = FreeSpider::Downloader::NewsTeaching.new(@news_teaching_content)
161
+ if news_teaching.save
162
+ puts "--------save success!--------"
163
+ else
164
+ puts "--------save error!--------"
165
+ end
120
166
  end
121
167
 
168
+ # def post_title
169
+ # @titles.uniq.compact
170
+ # end
171
+
172
+ # 写入文件
122
173
  def write_results_to_file(file_name)
123
174
  if File.exist?(file_name) || File.new(file_name, "w")
124
175
  File.open(file_name, "w") do |f|
@@ -1,7 +1,36 @@
1
1
  # 下载器的主要职责是抓取网页并将网页内容返还给蜘蛛(Spiders)
2
2
 
3
+ require 'active_record'
4
+ require 'mysql2'
5
+
3
6
  module FreeSpider
4
7
  module Downloader
8
+ ActiveRecord::Base.logger = Logger.new(STDERR)
9
+
10
+ # 链接数据库
11
+ puts "----database_connection-----"
12
+ ActiveRecord::Base.establish_connection(
13
+ adapter: 'mysql2',
14
+ host: 'localhost',
15
+ database: 'chuangkejiazu',
16
+ username: 'root',
17
+ password: '123'
18
+ )
19
+
20
+ # 创建表结构
21
+ puts "----table_create-----"
22
+ ActiveRecord::Schema.define do
23
+ unless ActiveRecord::Base.connection.tables.include? 'news_teachings'
24
+ create_table :news_teachings do |table|
25
+ table.column :title, :string
26
+ table.column :content, :text
27
+ end
28
+ end
29
+ end
30
+
31
+ class NewsTeaching < ActiveRecord::Base
32
+ validates_presence_of :title, :content
33
+ end
5
34
 
6
35
  end
7
36
  end
@@ -1,3 +1,3 @@
1
1
  module FreeSpider
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: free_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - free
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-08 00:00:00.000000000 Z
11
+ date: 2015-04-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.6.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: mysql2
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.13
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.13
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement