web-page-parser 0.10 → 0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,17 +10,24 @@ describe BbcNewsPageParserFactory do
10
10
  "http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
11
11
  "http://news.bbc.co.uk/1/hi/uk/7995652.stm",
12
12
  "http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
13
- "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
13
+ "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
14
+ "http://www.bbc.co.uk/news/business-11125504",
15
+ "http://www.bbc.co.uk/news/10604897"
14
16
  ]
15
17
  @invalid_urls = [
16
18
  "http://news.bbc.co.uk/2/hi/health/default.stm",
17
19
  "http://news.bbc.co.uk/2/low/europe/default.stm",
18
20
  "http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
19
21
  "http://news.bbc.co.uk/sport",
22
+ "http://news.bbc.co.uk/sport1/hi/tennis/8951357.stm",
20
23
  "http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
21
24
  "http://www.bbc.co.uk/blogs/nickrobinson/",
22
25
  "http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
23
- "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
26
+ "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm",
27
+ "http://www.bbc.co.uk/blogs/theeditors/",
28
+ "http://www.bbc.co.uk/news/have_your_say/",
29
+ "http://news.bbc.co.uk/1/hi/magazine/default.stm",
30
+ "http://news.bbc.co.uk/1/hi/in_pictures/default.stm"
24
31
  ]
25
32
  end
26
33
 
@@ -41,6 +48,61 @@ describe BbcNewsPageParserFactory do
41
48
  end
42
49
  end
43
50
 
51
+ describe BbcNewsPageParserV4 do
52
+ it_should_behave_like AllPageParsers
53
+ before do
54
+ @valid_options = {
55
+ :url => 'http://www.bbc.co.uk/news/business-11125504',
56
+ :page => File.read("spec/fixtures/bbc_news/11125504.html"),
57
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
58
+ }
59
+ @pa = BbcNewsPageParserV4.new(@valid_options)
60
+ end
61
+
62
+ it "should parse the title" do
63
+ @pa.title.should == "UK economy 'to pick up in near term'"
64
+ end
65
+
66
+ it "should parse the content" do
67
+ @pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
68
+ @pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
69
+ @pa.content.size.should == 18
70
+ end
71
+
72
+ end
73
+
74
+
75
+ describe BbcNewsPageParserV3 do
76
+ it_should_behave_like AllPageParsers
77
+ before do
78
+ @valid_options = {
79
+ :url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
80
+ :page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
81
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
82
+ }
83
+ @pa = BbcNewsPageParserV3.new(@valid_options)
84
+ end
85
+
86
+ it "should parse the content" do
87
+ @pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
88
+ @pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
89
+ @pa.content.size.should == 24
90
+ end
91
+
92
+ it "should parse the content of an article with two captions" do
93
+ @pa = BbcNewsPageParserV3.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
94
+ :page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
95
+ :valid_hash => 'unknown'
96
+ })
97
+ @pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
98
+ @pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
99
+ @pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
100
+ end
101
+
102
+
103
+
104
+ end
105
+
44
106
  describe BbcNewsPageParserV2 do
45
107
  it_should_behave_like AllPageParsers
46
108
  before do
@@ -140,5 +202,5 @@ describe BbcNewsPageParserV1 do
140
202
  @pa.title.should match Regexp.new("John's")
141
203
  @pa.title.should match /sucks & blows/
142
204
  end
143
-
205
+
144
206
  end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web-page-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.10"
4
+ hash: 33
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 21
9
+ version: "0.21"
5
10
  platform: ruby
6
11
  authors:
7
12
  - John Leach
@@ -9,29 +14,41 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2009-06-20 00:00:00 +01:00
17
+ date: 2010-08-30 00:00:00 +01:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: oniguruma
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
20
25
  requirements:
21
26
  - - ">="
22
27
  - !ruby/object:Gem::Version
28
+ hash: 19
29
+ segments:
30
+ - 1
31
+ - 1
32
+ - 0
23
33
  version: 1.1.0
24
- version:
34
+ type: :runtime
35
+ version_requirements: *id001
25
36
  - !ruby/object:Gem::Dependency
26
37
  name: htmlentities
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
30
41
  requirements:
31
42
  - - ">="
32
43
  - !ruby/object:Gem::Version
44
+ hash: 63
45
+ segments:
46
+ - 4
47
+ - 0
48
+ - 0
33
49
  version: 4.0.0
34
- version:
50
+ type: :runtime
51
+ version_requirements: *id002
35
52
  description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
36
53
  email: john@johnleach.co.uk
37
54
  executables: []
@@ -42,51 +59,69 @@ extra_rdoc_files:
42
59
  - README.rdoc
43
60
  - LICENSE
44
61
  files:
45
- - lib/web-page-parser
62
+ - lib/web-page-parser/parser_factory.rb
46
63
  - lib/web-page-parser/base_parser.rb
47
- - lib/web-page-parser/parsers
48
64
  - lib/web-page-parser/parsers/test_page_parser.rb
49
65
  - lib/web-page-parser/parsers/bbc_news_page_parser.rb
50
- - lib/web-page-parser/parser_factory.rb
51
66
  - lib/web-page-parser.rb
52
67
  - README.rdoc
53
68
  - LICENSE
69
+ - spec/parser_factory_spec.rb
70
+ - spec/base_parser_spec.rb
71
+ - spec/parsers/bbc_news_page_spec.rb
72
+ - spec/fixtures/bbc_news/10341015.stm.html
73
+ - spec/fixtures/bbc_news/8029015.stm.html
74
+ - spec/fixtures/bbc_news/7745137.stm.html
75
+ - spec/fixtures/bbc_news/8063681.stm.html
76
+ - spec/fixtures/bbc_news/10249066.stm.html
77
+ - spec/fixtures/bbc_news/8011268.stm.html
78
+ - spec/fixtures/bbc_news/11125504.html
79
+ - spec/fixtures/bbc_news/6072486.stm.html
80
+ - spec/spec.opts
54
81
  has_rdoc: true
55
82
  homepage: http://github.com/johnl/web-page-parser/tree/master
83
+ licenses: []
84
+
56
85
  post_install_message:
57
86
  rdoc_options: []
58
87
 
59
88
  require_paths:
60
89
  - lib
61
90
  required_ruby_version: !ruby/object:Gem::Requirement
91
+ none: false
62
92
  requirements:
63
93
  - - ">="
64
94
  - !ruby/object:Gem::Version
95
+ hash: 3
96
+ segments:
97
+ - 0
65
98
  version: "0"
66
- version:
67
99
  required_rubygems_version: !ruby/object:Gem::Requirement
100
+ none: false
68
101
  requirements:
69
102
  - - ">="
70
103
  - !ruby/object:Gem::Version
104
+ hash: 3
105
+ segments:
106
+ - 0
71
107
  version: "0"
72
- version:
73
108
  requirements: []
74
109
 
75
110
  rubyforge_project: web-page-parser
76
- rubygems_version: 1.3.1
111
+ rubygems_version: 1.3.7
77
112
  signing_key:
78
- specification_version: 2
113
+ specification_version: 3
79
114
  summary: A parser for web pages
80
115
  test_files:
81
116
  - spec/parser_factory_spec.rb
82
117
  - spec/base_parser_spec.rb
83
- - spec/fixtures
84
- - spec/fixtures/bbc_news
118
+ - spec/parsers/bbc_news_page_spec.rb
119
+ - spec/fixtures/bbc_news/10341015.stm.html
120
+ - spec/fixtures/bbc_news/8029015.stm.html
121
+ - spec/fixtures/bbc_news/7745137.stm.html
85
122
  - spec/fixtures/bbc_news/8063681.stm.html
123
+ - spec/fixtures/bbc_news/10249066.stm.html
86
124
  - spec/fixtures/bbc_news/8011268.stm.html
125
+ - spec/fixtures/bbc_news/11125504.html
87
126
  - spec/fixtures/bbc_news/6072486.stm.html
88
- - spec/fixtures/bbc_news/8029015.stm.html
89
- - spec/fixtures/bbc_news/7745137.stm.html
90
- - spec/parsers
91
- - spec/parsers/bbc_news_page_spec.rb
92
127
  - spec/spec.opts