web-page-parser 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), '../lib')
2
+ require 'web-page-parser'
3
+ include WebPageParser
4
+
5
+ describe ParserFactory do # Specs for ParserFactory: the factories list and parser_for lookup by :url
6
+ # NOTE(review): the first example depends on TestPageParserFactory being the first element of ParserFactory.factories - presumably load order; confirm
7
+ it "should load parsers in the parsers directory" do
8
+ ParserFactory.factories.first.to_s.should == "TestPageParserFactory" # compared via to_s, i.e. by name
9
+ end
10
+ # parser_for is called with an options hash that carries only :url.
11
+ it "should provide the right PageParser for the given url" do
12
+ ParserFactory.parser_for(:url => "http://www.example.com").should be_a_kind_of TestPageParser
13
+ end
14
+ # A url that no factory accepts is expected to give nil rather than raise.
15
+ it "should return nil if no PageParser can be found for the given url" do
16
+ ParserFactory.parser_for(:url => "http://www.nowhere.nodomain").should be_nil
17
+ end
18
+ end
@@ -0,0 +1,144 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), '../../lib')
2
+ require 'spec/base_parser_spec'
3
+ require 'web-page-parser'
4
+ include WebPageParser
5
+
6
+ describe BbcNewsPageParserFactory do # Specs for BbcNewsPageParserFactory.can_parse? url matching
7
+ before do
8
+ @valid_urls = [ # urls the factory is expected to accept
9
+ "http://news.bbc.co.uk/1/hi/entertainment/6984082.stm",
10
+ "http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
11
+ "http://news.bbc.co.uk/1/hi/uk/7995652.stm",
12
+ "http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
13
+ "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
14
+ ]
15
+ @invalid_urls = [ # urls the factory is expected to reject (can_parse? gives nil)
16
+ "http://news.bbc.co.uk/2/hi/health/default.stm",
17
+ "http://news.bbc.co.uk/2/low/europe/default.stm",
18
+ "http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
19
+ "http://news.bbc.co.uk/sport",
20
+ "http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
21
+ "http://www.bbc.co.uk/blogs/nickrobinson/",
22
+ "http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
23
+ "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
24
+ ]
25
+ end
26
+ # Accepted urls are checked with be_true; rejected urls must return exactly nil (be_nil).
27
+ it "should detect bbc news articles from the url" do
28
+ @valid_urls.each do |url|
29
+ BbcNewsPageParserFactory.can_parse?(:url => url).should be_true
30
+ end
31
+ end
32
+
33
+ it "should ignore pages with the wrong url format" do
34
+ @invalid_urls.each do |url|
35
+ BbcNewsPageParserFactory.can_parse?(:url => url).should be_nil
36
+ end
37
+ end
38
+ # This url has the same /1/hi/.../<digits>.stm shape as the accepted urls above but must still be rejected.
39
+ it "should ignore 'in pictures' articles" do
40
+ BbcNewsPageParserFactory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/in_pictures/8039882.stm').should be_nil
41
+ end
42
+ end
43
+
44
+ describe BbcNewsPageParserV2 do # Specs for BbcNewsPageParserV2 against saved BBC News fixture pages
45
+ it_should_behave_like AllPageParsers # presumably the shared group from spec/base_parser_spec - confirm
46
+ before do
47
+ @valid_options = {
48
+ :url => 'http://news.bbc.co.uk/1/hi/world/middle_east/8011268.stm',
49
+ :page => File.read("spec/fixtures/bbc_news/8011268.stm.html"), # relative path: resolved against the current working directory
50
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978' # expected value of @pa.hash, checked below
51
+ }
52
+ @pa = BbcNewsPageParserV2.new(@valid_options)
53
+ end
54
+
55
+ it "should parse the title" do
56
+ @pa.title.should == "Obama invites Middle East heads"
57
+ end
58
+ # "\243" is the single byte 0xA3, the pound sign in ISO-8859-1; the parser is expected to return it as "£".
59
+ it "should convert iso-8859-1 in the title to utf8" do
60
+ page = BbcNewsPageParserV2.new(:page => '<meta name="Headline" content="'+"\243"+'100K mortgage claim by Woodward"')
61
+ page.title.should == "£100K mortgage claim by Woodward"
62
+ end
63
+ # Same conversion check for body text placed between the "S BO -->" and "<!-- E BO" markers.
64
+ it "should convert iso-8859-1 in the content to utf8" do
65
+ page = BbcNewsPageParserV2.new(:page => "S BO -->\243100K mortgage claim by Woodward<!-- E BO")
66
+ page.content.first.should == "£100K mortgage claim by Woodward"
67
+ end
68
+
69
+
70
+ it "should parse the date in UTC" do
71
+ # 2009/04/21 19:50:44
72
+ @pa.date.should == DateTime.parse("Apr 21 19:50:44 +0000 2009")
73
+ @pa.date.zone.should == '+00:00'
74
+ end
75
+
76
+ it "should parse the content" do
77
+ @pa.content[0].should == "US officials say the leaders of Israel, Egypt and the Palestinians have been invited for talks in Washington in a new push for Middle East peace."
78
+ @pa.content.last.should == "The US supports a two-state solution, with Israel existing peacefully alongside a Palestinian state."
79
+ @pa.content.size.should == 15
80
+ end
81
+ # Element 8 of content is expected to hold literal double quotes; presumably the fixture has them entity-encoded - confirm against the fixture.
82
+ it "should decode html entities" do
83
+ @pa.content[8].should == 'He added: "We are actively working to finalise dates for the visits."'
84
+ end
85
+
86
+ it "should calculate a valid hash of the content" do
87
+ @pa.hash.should == @valid_options[:valid_hash]
88
+ end
89
+ # The next two examples build their own parser from other fixtures instead of using @pa.
90
+ it "should parse 'from our own correspondent' pages" do
91
+ page = BbcNewsPageParserV2.new(:url => "http://news.bbc.co.uk/1/hi/programmes/from_our_own_correspondent/8029015.stm",
92
+ :page => File.read("spec/fixtures/bbc_news/8029015.stm.html"))
93
+ page.title.should == "Cairo's terrifying traffic chaos"
94
+ page.content.first.should == "Christian Fraser discovers that a brush with death on Cairo's congested roads leaves no appetite for life in the fast lane."
95
+ end
96
+
97
+ it "should parse 'magazine' pages" do
98
+ page = BbcNewsPageParserV2.new(:url => "http://news.bbc.co.uk/1/hi/magazine/8063681.stm",
99
+ :page => File.read("spec/fixtures/bbc_news/8063681.stm.html"))
100
+ page.title.should == "My night with Parisien prostitutes"
101
+ page.content.first.should == "Wandering around the red light district of Paris as a teenager taught me all I need to know - about teenagers, not women, says Laurie Taylor in his weekly column."
102
+ end
103
+
104
+ end
105
+
106
+ describe BbcNewsPageParserV1 do # Specs for BbcNewsPageParserV1 against a saved BBC News fixture page dated 2006
107
+ before do
108
+ @valid_options = {
109
+ :url => 'http://news.bbc.co.uk/1/hi/england/bradford/6072486.stm',
110
+ :page => File.read("spec/fixtures/bbc_news/6072486.stm.html"), # relative path: resolved against the current working directory
111
+ :valid_hash => 'aaf7ed1219eb69c3126ea5d0774fbe7d' # expected value of @pa.hash, checked in the content example
112
+ }
113
+ @pa = BbcNewsPageParserV1.new(@valid_options)
114
+ end
115
+
116
+ it "should parse the title" do
117
+ @pa.title.should == "Son-in-law remanded over killing"
118
+ end
119
+
120
+ it "should parse the date in UTC" do
121
+ @pa.date.should == DateTime.parse("Sat Oct 21 14:41:10 +0000 2006")
122
+ @pa.date.zone.should == '+00:00'
123
+ end
124
+ # V1 content is expected to keep inline markup: the first paragraph retains its <B>...</B> tags.
125
+ it "should parse the content exactly like the old News Sniffer library" do
126
+ @pa.content.first.should == "<B>The son-in-law of a 73-year-old Castleford widow has been charged with her murder.</B>"
127
+ @pa.content.last.should == 'He denied the charges against him through his solicitor and is due to appear at Leeds Crown Court on Friday.'
128
+ @pa.content.size.should == 5
129
+ @pa.hash.should == @valid_options[:valid_hash]
130
+ end
131
+ # NOTE(review): the two descriptions below mention a "pound sign", but the assertions only cover &apos; and &amp; - confirm the intended wording.
132
+ it "should convert apostrophe and pound sign html entities in content" do
133
+ @pa = BbcNewsPageParserV1.new :page => 'S SF -->John&apos;s code sucks &amp; blows<!-- E BO' # replaces the fixture-backed @pa from the before block
134
+ @pa.content.to_s.should match Regexp.new("John's")
135
+ @pa.content.to_s.should match /sucks & blows/
136
+ end
137
+
138
+ it "should convert apostrophe and pound sign html entities in page titles" do
139
+ @pa = BbcNewsPageParserV1.new :page => '<meta name="Headline" content="John&apos;s code sucks &amp; blows!"/>'
140
+ @pa.title.should match Regexp.new("John's")
141
+ @pa.title.should match /sucks & blows/
142
+ end
143
+
144
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s
3
+ --loadby mtime
4
+ --reverse
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web-page-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.10"
5
+ platform: ruby
6
+ authors:
7
+ - John Leach
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-20 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: oniguruma
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.1.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: htmlentities
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 4.0.0
34
+ version:
35
+ description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
36
+ email: john@johnleach.co.uk
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - README.rdoc
43
+ - LICENSE
44
+ files:
45
+ - lib/web-page-parser
46
+ - lib/web-page-parser/base_parser.rb
47
+ - lib/web-page-parser/parsers
48
+ - lib/web-page-parser/parsers/test_page_parser.rb
49
+ - lib/web-page-parser/parsers/bbc_news_page_parser.rb
50
+ - lib/web-page-parser/parser_factory.rb
51
+ - lib/web-page-parser.rb
52
+ - README.rdoc
53
+ - LICENSE
54
+ has_rdoc: true
55
+ homepage: http://github.com/johnl/web-page-parser/tree/master
56
+ post_install_message:
57
+ rdoc_options: []
58
+
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ requirements: []
74
+
75
+ rubyforge_project: web-page-parser
76
+ rubygems_version: 1.3.1
77
+ signing_key:
78
+ specification_version: 2
79
+ summary: A parser for web pages
80
+ test_files:
81
+ - spec/parser_factory_spec.rb
82
+ - spec/base_parser_spec.rb
83
+ - spec/fixtures
84
+ - spec/fixtures/bbc_news
85
+ - spec/fixtures/bbc_news/8063681.stm.html
86
+ - spec/fixtures/bbc_news/8011268.stm.html
87
+ - spec/fixtures/bbc_news/6072486.stm.html
88
+ - spec/fixtures/bbc_news/8029015.stm.html
89
+ - spec/fixtures/bbc_news/7745137.stm.html
90
+ - spec/parsers
91
+ - spec/parsers/bbc_news_page_spec.rb
92
+ - spec/spec.opts