readable 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/lib/readable.rb +9 -0
- data/lib/readable/rule.rb +23 -0
- data/lib/readable/rules/sina.rb +9 -0
- data/lib/readable/rules/techcrunch.rb +9 -0
- data/lib/readable/version.rb +4 -0
- data/lib/readable/webpage.rb +70 -0
- data/readable.gemspec +23 -0
- data/spec/pages/qq.html +1992 -0
- data/spec/pages/sina.html +1202 -0
- data/spec/pages/sohu.html +2311 -0
- data/spec/pages/techcrunch.html +1157 -0
- data/spec/pages/wordpress.html +1375 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/webpage_spec.rb +57 -0
- metadata +125 -0
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), '../lib/readable')
|
3
|
+
|
4
|
+
# Reads a file located relative to this file's directory and returns its
# entire content.
#
# The file is opened with the "r:binary" mode string, so the returned String
# carries the binary (ASCII-8BIT) encoding. The block form of File.open is
# used so the handle is closed even when reading raises.
#
# @param filename [String] path relative to the directory of this file
# @return [String] the raw content of the file
# @raise [SystemCallError] e.g. Errno::ENOENT when the file does not exist
def read(filename)
  path = File.join(File.dirname(__FILE__), filename)
  File.open(path, "r:binary") { |file| file.read }
end
|
10
|
+
|
11
|
+
# Makes Net::HTTP.start return a stubbed object whose +body+ is the content
# of the given fixture file, as loaded by +read+.
#
# NOTE(review): +stub!+ and +and_return+ presumably come from the RSpec mocking
# API loaded by the spec run - confirm against the RSpec version in use.
#
# @param filename [String] fixture path, passed unchanged to +read+
def stub_page!(filename)
  page_source = read(filename)
  response = stub!(:result)
  response.stub!(:body => page_source)
  Net::HTTP.stub!(:start).and_return(response)
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '/spec_helper')
|
4
|
+
|
5
|
+
# Encoding behaviour: the HTML exposed by a Webpage must report UTF-8.
describe Readable::Webpage, 'Encoding' do
  it "should convert html into utf-8 according to charset of webpage" do
    # Serve the sohu fixture in place of a real HTTP response.
    stub_page!('pages/sohu.html')
    webpage = Readable::Webpage.new('http://test.com')
    encoding_name = webpage.html.encoding.name
    encoding_name.downcase.should == 'utf-8'
  end
end
|
12
|
+
|
13
|
+
# Parsing behaviour: for each fixture page, check the title, that text from
# the article is present in the content, and that a phrase which must be
# excluded is absent.
describe Readable::Webpage, "Parse" do
  # Fixture: pages/sohu.html
  it "should parse sohu correctly" do
    stub_page!('pages/sohu.html')
    webpage = Readable::Webpage.new('http://sohu.com')
    webpage.title.should == '揭秘玫瑰怒放四部曲 投射能力增强才是蜕变之源'
    webpage.content.should be_include('没错,德里克-罗斯是从小看着迈克尔-乔丹打球长大的,在他成为2008年的选秀状元之时,他也没想过自己能这么快达到如此高的巅峰,这也出乎了所有人的意料,因为完成这一切他仅仅用了3个赛季而以,本赛季的罗斯可谓一飞冲天,即便是他赛季之前就成宣称自己要成为“MVP”,人们也并没有把这句话放在心上。')
    webpage.content.should_not be_include('网友关注排行')
  end

  # Fixture: pages/qq.html
  it "should parse qq news correctly" do
    stub_page!('pages/qq.html')
    webpage = Readable::Webpage.new('http://qq.com')
    webpage.title.should == '卡扎菲政府军发言人在北约空袭中死亡'
    webpage.content.should be_include('中新网5月16日电')
    webpage.content.should be_include('因为这些设施是卡扎菲政权维持统治的工具。')
    webpage.content.should_not be_include('每日推荐')
  end

  # Fixture: pages/sina.html
  it "should parse sina news correctly" do
    stub_page!('pages/sina.html')
    webpage = Readable::Webpage.new('http://sina.com.cn')
    webpage.title.should == '巴基斯坦同意今日归还美绝密隐身直升机残骸'
    webpage.content.should be_include('环球网记者')
    webpage.content.should be_include('抵达展开准备工作')
    webpage.content.should_not be_include('军事论坛')
  end

  # Fixture: pages/techcrunch.html
  it "should parse techcrunch correctly" do
    stub_page!('pages/techcrunch.html')
    webpage = Readable::Webpage.new('http://techcrunch.com')
    webpage.title.should be_include('Chrome OS 12')
    webpage.content.should be_include('Last week, just prior to day one of Google I/O')
    webpage.content.should be_include('what the first Chromebooks will feel like, check it out.')
    webpage.content.should_not be_include('Jobs')
  end

  # Fixture: pages/wordpress.html
  it "should parse wordpress correctly" do
    stub_page!('pages/wordpress.html')
    webpage = Readable::Webpage.new('http://wordpress.com')
    webpage.title.should be_include('My Weekend')
    webpage.content.should be_include('It is said that Greek people')
    webpage.content.should be_include('Kate')
    webpage.content.should_not be_include('You ARE a Goddess')
  end
end
|
metadata
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: readable
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Zhang Yuanyi
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-05-17 00:00:00 +08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :development
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: nokogiri
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: sanitize
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id003
|
59
|
+
description: Readable provides a more comfortable way to read the web.
|
60
|
+
email:
|
61
|
+
- zhangyuanyi@gmail.com
|
62
|
+
executables: []
|
63
|
+
|
64
|
+
extensions: []
|
65
|
+
|
66
|
+
extra_rdoc_files: []
|
67
|
+
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- Gemfile
|
71
|
+
- Rakefile
|
72
|
+
- lib/readable.rb
|
73
|
+
- lib/readable/rule.rb
|
74
|
+
- lib/readable/rules/sina.rb
|
75
|
+
- lib/readable/rules/techcrunch.rb
|
76
|
+
- lib/readable/version.rb
|
77
|
+
- lib/readable/webpage.rb
|
78
|
+
- readable.gemspec
|
79
|
+
- spec/pages/qq.html
|
80
|
+
- spec/pages/sina.html
|
81
|
+
- spec/pages/sohu.html
|
82
|
+
- spec/pages/techcrunch.html
|
83
|
+
- spec/pages/wordpress.html
|
84
|
+
- spec/spec_helper.rb
|
85
|
+
- spec/webpage_spec.rb
|
86
|
+
has_rdoc: true
|
87
|
+
homepage: ""
|
88
|
+
licenses: []
|
89
|
+
|
90
|
+
post_install_message:
|
91
|
+
rdoc_options: []
|
92
|
+
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
none: false
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
segments:
|
101
|
+
- 0
|
102
|
+
version: "0"
|
103
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
|
+
none: false
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
segments:
|
109
|
+
- 0
|
110
|
+
version: "0"
|
111
|
+
requirements: []
|
112
|
+
|
113
|
+
rubyforge_project:
|
114
|
+
rubygems_version: 1.3.7
|
115
|
+
signing_key:
|
116
|
+
specification_version: 3
|
117
|
+
summary: Readable makes web content easier to read
|
118
|
+
test_files:
|
119
|
+
- spec/pages/qq.html
|
120
|
+
- spec/pages/sina.html
|
121
|
+
- spec/pages/sohu.html
|
122
|
+
- spec/pages/techcrunch.html
|
123
|
+
- spec/pages/wordpress.html
|
124
|
+
- spec/spec_helper.rb
|
125
|
+
- spec/webpage_spec.rb
|