web-page-parser 0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), '../lib')
2
+ require 'web-page-parser'
3
+ include WebPageParser
4
+
5
+ describe ParserFactory do # Specs for ParserFactory: the factory list and URL-to-parser lookup.
6
+
7
+ it "should load parsers in the parsers directory" do
8
+ ParserFactory.factories.first.to_s.should == "TestPageParserFactory" # Compares the string form of the first entry in ParserFactory.factories.
9
+ end
10
+
11
+ it "should provide the right PageParser for the given url" do
12
+ ParserFactory.parser_for(:url => "http://www.example.com").should be_a_kind_of TestPageParser # parser_for takes an options hash with a :url key.
13
+ end
14
+
15
+ it "should return nil if no PageParser can be found for the given url" do
16
+ ParserFactory.parser_for(:url => "http://www.nowhere.nodomain").should be_nil # No parser is expected for this URL, so the result must be nil.
17
+ end
18
+ end
@@ -0,0 +1,144 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), '../../lib')
2
+ require 'spec/base_parser_spec'
3
+ require 'web-page-parser'
4
+ include WebPageParser
5
+
6
+ describe BbcNewsPageParserFactory do # Specs for URL matching via BbcNewsPageParserFactory.can_parse?.
7
+ before do # Sets up the URL lists used by the examples below.
8
+ @valid_urls = [ # URLs the factory is expected to accept.
9
+ "http://news.bbc.co.uk/1/hi/entertainment/6984082.stm",
10
+ "http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
11
+ "http://news.bbc.co.uk/1/hi/uk/7995652.stm",
12
+ "http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
13
+ "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
14
+ ]
15
+ @invalid_urls = [ # URLs the factory is expected to reject.
16
+ "http://news.bbc.co.uk/2/hi/health/default.stm",
17
+ "http://news.bbc.co.uk/2/low/europe/default.stm",
18
+ "http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
19
+ "http://news.bbc.co.uk/sport",
20
+ "http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
21
+ "http://www.bbc.co.uk/blogs/nickrobinson/",
22
+ "http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
23
+ "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
24
+ ]
25
+ end
26
+
27
+ it "should detect bbc news articles from the url" do
28
+ @valid_urls.each do |url|
29
+ BbcNewsPageParserFactory.can_parse?(:url => url).should be_true # Every accepted URL must satisfy be_true.
30
+ end
31
+ end
32
+
33
+ it "should ignore pages with the wrong url format" do
34
+ @invalid_urls.each do |url|
35
+ BbcNewsPageParserFactory.can_parse?(:url => url).should be_nil # Rejection is asserted as nil specifically.
36
+ end
37
+ end
38
+
39
+ it "should ignore 'in pictures' articles" do
40
+ BbcNewsPageParserFactory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/in_pictures/8039882.stm').should be_nil # Numeric story id like the valid URLs, but under in_pictures.
41
+ end
42
+ end
43
+
44
+ describe BbcNewsPageParserV2 do # Specs for BbcNewsPageParserV2, mostly run against saved page fixtures.
45
+ it_should_behave_like AllPageParsers # Pulls in shared examples; presumably defined in spec/base_parser_spec - confirm.
46
+ before do
47
+ @valid_options = {
48
+ :url => 'http://news.bbc.co.uk/1/hi/world/middle_east/8011268.stm',
49
+ :page => File.read("spec/fixtures/bbc_news/8011268.stm.html"), # Relative path, resolved against the current working directory.
50
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978' # Expected value of @pa.hash (checked in the hash example below).
51
+ }
52
+ @pa = BbcNewsPageParserV2.new(@valid_options) # Parser under test, built from the options above.
53
+ end
54
+
55
+ it "should parse the title" do
56
+ @pa.title.should == "Obama invites Middle East heads"
57
+ end
58
+
59
+ it "should convert iso-8859-1 in the title to utf8" do
60
+ page = BbcNewsPageParserV2.new(:page => '<meta name="Headline" content="'+"\243"+'100K mortgage claim by Woodward"') # "\243" is an octal escape for byte 0xA3, the pound sign in ISO-8859-1.
61
+ page.title.should == "£100K mortgage claim by Woodward"
62
+ end
63
+
64
+ it "should convert iso-8859-1 in the content to utf8" do
65
+ page = BbcNewsPageParserV2.new(:page => "S BO -->\243100K mortgage claim by Woodward<!-- E BO") # Same 0xA3 byte, placed between the "S BO" and "E BO" comment markers.
66
+ page.content.first.should == "£100K mortgage claim by Woodward"
67
+ end
68
+
69
+
70
+ it "should parse the date in UTC" do
71
+ # 2009/04/21 19:50:44
72
+ @pa.date.should == DateTime.parse("Apr 21 19:50:44 +0000 2009")
73
+ @pa.date.zone.should == '+00:00' # The zone string must show a zero offset.
74
+ end
75
+
76
+ it "should parse the content" do
77
+ @pa.content[0].should == "US officials say the leaders of Israel, Egypt and the Palestinians have been invited for talks in Washington in a new push for Middle East peace."
78
+ @pa.content.last.should == "The US supports a two-state solution, with Israel existing peacefully alongside a Palestinian state."
79
+ @pa.content.size.should == 15
80
+ end
81
+
82
+ it "should decode html entities" do
83
+ @pa.content[8].should == 'He added: "We are actively working to finalise dates for the visits."' # Expects literal double quotes; presumably entity-encoded in the fixture - confirm.
84
+ end
85
+
86
+ it "should calculate a valid hash of the content" do
87
+ @pa.hash.should == @valid_options[:valid_hash] # Compares against :valid_hash from the setup options.
88
+ end
89
+
90
+ it "should parse 'from our own correspondent' pages" do
91
+ page = BbcNewsPageParserV2.new(:url => "http://news.bbc.co.uk/1/hi/programmes/from_our_own_correspondent/8029015.stm",
92
+ :page => File.read("spec/fixtures/bbc_news/8029015.stm.html"))
93
+ page.title.should == "Cairo's terrifying traffic chaos"
94
+ page.content.first.should == "Christian Fraser discovers that a brush with death on Cairo's congested roads leaves no appetite for life in the fast lane."
95
+ end
96
+
97
+ it "should parse 'magazine' pages" do
98
+ page = BbcNewsPageParserV2.new(:url => "http://news.bbc.co.uk/1/hi/magazine/8063681.stm",
99
+ :page => File.read("spec/fixtures/bbc_news/8063681.stm.html"))
100
+ page.title.should == "My night with Parisien prostitutes"
101
+ page.content.first.should == "Wandering around the red light district of Paris as a teenager taught me all I need to know - about teenagers, not women, says Laurie Taylor in his weekly column."
102
+ end
103
+
104
+ end
105
+
106
+ describe BbcNewsPageParserV1 do
107
+ before do
108
+ @valid_options = {
109
+ :url => 'http://news.bbc.co.uk/1/hi/england/bradford/6072486.stm',
110
+ :page => File.read("spec/fixtures/bbc_news/6072486.stm.html"),
111
+ :valid_hash => 'aaf7ed1219eb69c3126ea5d0774fbe7d'
112
+ }
113
+ @pa = BbcNewsPageParserV1.new(@valid_options)
114
+ end
115
+
116
+ it "should parse the title" do
117
+ @pa.title.should == "Son-in-law remanded over killing"
118
+ end
119
+
120
+ it "should parse the date in UTC" do
121
+ @pa.date.should == DateTime.parse("Sat Oct 21 14:41:10 +0000 2006")
122
+ @pa.date.zone.should == '+00:00'
123
+ end
124
+
125
+ it "should parse the content exactly like the old News Sniffer library" do
126
+ @pa.content.first.should == "<B>The son-in-law of a 73-year-old Castleford widow has been charged with her murder.</B>"
127
+ @pa.content.last.should == 'He denied the charges against him through his solicitor and is due to appear at Leeds Crown Court on Friday.'
128
+ @pa.content.size.should == 5
129
+ @pa.hash.should == @valid_options[:valid_hash]
130
+ end
131
+
132
+ it "should convert apostrophe and pound sign html entities in content" do
133
+ @pa = BbcNewsPageParserV1.new :page => 'S SF -->John&apos;s code sucks &amp; blows<!-- E BO'
134
+ @pa.content.to_s.should match Regexp.new("John's")
135
+ @pa.content.to_s.should match /sucks & blows/
136
+ end
137
+
138
+ it "should convert apostrophe and pound sign html entities in page titles" do
139
+ @pa = BbcNewsPageParserV1.new :page => '<meta name="Headline" content="John&apos;s code sucks &amp; blows!"/>'
140
+ @pa.title.should match Regexp.new("John's")
141
+ @pa.title.should match /sucks & blows/
142
+ end
143
+
144
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s
3
+ --loadby mtime
4
+ --reverse
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web-page-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.10"
5
+ platform: ruby
6
+ authors:
7
+ - John Leach
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-20 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: oniguruma
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.1.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: htmlentities
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 4.0.0
34
+ version:
35
+ description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
36
+ email: john@johnleach.co.uk
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - README.rdoc
43
+ - LICENSE
44
+ files:
45
+ - lib/web-page-parser
46
+ - lib/web-page-parser/base_parser.rb
47
+ - lib/web-page-parser/parsers
48
+ - lib/web-page-parser/parsers/test_page_parser.rb
49
+ - lib/web-page-parser/parsers/bbc_news_page_parser.rb
50
+ - lib/web-page-parser/parser_factory.rb
51
+ - lib/web-page-parser.rb
52
+ - README.rdoc
53
+ - LICENSE
54
+ has_rdoc: true
55
+ homepage: http://github.com/johnl/web-page-parser/tree/master
56
+ post_install_message:
57
+ rdoc_options: []
58
+
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ requirements: []
74
+
75
+ rubyforge_project: web-page-parser
76
+ rubygems_version: 1.3.1
77
+ signing_key:
78
+ specification_version: 2
79
+ summary: A parser for web pages
80
+ test_files:
81
+ - spec/parser_factory_spec.rb
82
+ - spec/base_parser_spec.rb
83
+ - spec/fixtures
84
+ - spec/fixtures/bbc_news
85
+ - spec/fixtures/bbc_news/8063681.stm.html
86
+ - spec/fixtures/bbc_news/8011268.stm.html
87
+ - spec/fixtures/bbc_news/6072486.stm.html
88
+ - spec/fixtures/bbc_news/8029015.stm.html
89
+ - spec/fixtures/bbc_news/7745137.stm.html
90
+ - spec/parsers
91
+ - spec/parsers/bbc_news_page_spec.rb
92
+ - spec/spec.opts