web-page-parser 0.10 → 0.21

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,17 +10,24 @@ describe BbcNewsPageParserFactory do
10
10
  "http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
11
11
  "http://news.bbc.co.uk/1/hi/uk/7995652.stm",
12
12
  "http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
13
- "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
13
+ "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
14
+ "http://www.bbc.co.uk/news/business-11125504",
15
+ "http://www.bbc.co.uk/news/10604897"
14
16
  ]
15
17
  @invalid_urls = [
16
18
  "http://news.bbc.co.uk/2/hi/health/default.stm",
17
19
  "http://news.bbc.co.uk/2/low/europe/default.stm",
18
20
  "http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
19
21
  "http://news.bbc.co.uk/sport",
22
+ "http://news.bbc.co.uk/sport1/hi/tennis/8951357.stm",
20
23
  "http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
21
24
  "http://www.bbc.co.uk/blogs/nickrobinson/",
22
25
  "http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
23
- "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
26
+ "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm",
27
+ "http://www.bbc.co.uk/blogs/theeditors/",
28
+ "http://www.bbc.co.uk/news/have_your_say/",
29
+ "http://news.bbc.co.uk/1/hi/magazine/default.stm",
30
+ "http://news.bbc.co.uk/1/hi/in_pictures/default.stm"
24
31
  ]
25
32
  end
26
33
 
@@ -41,6 +48,61 @@ describe BbcNewsPageParserFactory do
41
48
  end
42
49
  end
43
50
 
51
+ describe BbcNewsPageParserV4 do
52
+ it_should_behave_like AllPageParsers
53
+ before do
54
+ @valid_options = {
55
+ :url => 'http://www.bbc.co.uk/news/business-11125504',
56
+ :page => File.read("spec/fixtures/bbc_news/11125504.html"),
57
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
58
+ }
59
+ @pa = BbcNewsPageParserV4.new(@valid_options)
60
+ end
61
+
62
+ it "should parse the title" do
63
+ @pa.title.should == "UK economy 'to pick up in near term'"
64
+ end
65
+
66
+ it "should parse the content" do
67
+ @pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
68
+ @pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
69
+ @pa.content.size.should == 18
70
+ end
71
+
72
+ end
73
+
74
+
75
+ describe BbcNewsPageParserV3 do
76
+ it_should_behave_like AllPageParsers
77
+ before do
78
+ @valid_options = {
79
+ :url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
80
+ :page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
81
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
82
+ }
83
+ @pa = BbcNewsPageParserV3.new(@valid_options)
84
+ end
85
+
86
+ it "should parse the content" do
87
+ @pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
88
+ @pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
89
+ @pa.content.size.should == 24
90
+ end
91
+
92
+ it "should parse the content of an article with two captions" do
93
+ @pa = BbcNewsPageParserV3.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
94
+ :page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
95
+ :valid_hash => 'unknown'
96
+ })
97
+ @pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
98
+ @pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
99
+ @pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
100
+ end
101
+
102
+
103
+
104
+ end
105
+
44
106
  describe BbcNewsPageParserV2 do
45
107
  it_should_behave_like AllPageParsers
46
108
  before do
@@ -140,5 +202,5 @@ describe BbcNewsPageParserV1 do
140
202
  @pa.title.should match Regexp.new("John's")
141
203
  @pa.title.should match /sucks & blows/
142
204
  end
143
-
205
+
144
206
  end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web-page-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.10"
4
+ hash: 33
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 21
9
+ version: "0.21"
5
10
  platform: ruby
6
11
  authors:
7
12
  - John Leach
@@ -9,29 +14,41 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2009-06-20 00:00:00 +01:00
17
+ date: 2010-08-30 00:00:00 +01:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: oniguruma
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
20
25
  requirements:
21
26
  - - ">="
22
27
  - !ruby/object:Gem::Version
28
+ hash: 19
29
+ segments:
30
+ - 1
31
+ - 1
32
+ - 0
23
33
  version: 1.1.0
24
- version:
34
+ type: :runtime
35
+ version_requirements: *id001
25
36
  - !ruby/object:Gem::Dependency
26
37
  name: htmlentities
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
30
41
  requirements:
31
42
  - - ">="
32
43
  - !ruby/object:Gem::Version
44
+ hash: 63
45
+ segments:
46
+ - 4
47
+ - 0
48
+ - 0
33
49
  version: 4.0.0
34
- version:
50
+ type: :runtime
51
+ version_requirements: *id002
35
52
  description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
36
53
  email: john@johnleach.co.uk
37
54
  executables: []
@@ -42,51 +59,69 @@ extra_rdoc_files:
42
59
  - README.rdoc
43
60
  - LICENSE
44
61
  files:
45
- - lib/web-page-parser
62
+ - lib/web-page-parser/parser_factory.rb
46
63
  - lib/web-page-parser/base_parser.rb
47
- - lib/web-page-parser/parsers
48
64
  - lib/web-page-parser/parsers/test_page_parser.rb
49
65
  - lib/web-page-parser/parsers/bbc_news_page_parser.rb
50
- - lib/web-page-parser/parser_factory.rb
51
66
  - lib/web-page-parser.rb
52
67
  - README.rdoc
53
68
  - LICENSE
69
+ - spec/parser_factory_spec.rb
70
+ - spec/base_parser_spec.rb
71
+ - spec/parsers/bbc_news_page_spec.rb
72
+ - spec/fixtures/bbc_news/10341015.stm.html
73
+ - spec/fixtures/bbc_news/8029015.stm.html
74
+ - spec/fixtures/bbc_news/7745137.stm.html
75
+ - spec/fixtures/bbc_news/8063681.stm.html
76
+ - spec/fixtures/bbc_news/10249066.stm.html
77
+ - spec/fixtures/bbc_news/8011268.stm.html
78
+ - spec/fixtures/bbc_news/11125504.html
79
+ - spec/fixtures/bbc_news/6072486.stm.html
80
+ - spec/spec.opts
54
81
  has_rdoc: true
55
82
  homepage: http://github.com/johnl/web-page-parser/tree/master
83
+ licenses: []
84
+
56
85
  post_install_message:
57
86
  rdoc_options: []
58
87
 
59
88
  require_paths:
60
89
  - lib
61
90
  required_ruby_version: !ruby/object:Gem::Requirement
91
+ none: false
62
92
  requirements:
63
93
  - - ">="
64
94
  - !ruby/object:Gem::Version
95
+ hash: 3
96
+ segments:
97
+ - 0
65
98
  version: "0"
66
- version:
67
99
  required_rubygems_version: !ruby/object:Gem::Requirement
100
+ none: false
68
101
  requirements:
69
102
  - - ">="
70
103
  - !ruby/object:Gem::Version
104
+ hash: 3
105
+ segments:
106
+ - 0
71
107
  version: "0"
72
- version:
73
108
  requirements: []
74
109
 
75
110
  rubyforge_project: web-page-parser
76
- rubygems_version: 1.3.1
111
+ rubygems_version: 1.3.7
77
112
  signing_key:
78
- specification_version: 2
113
+ specification_version: 3
79
114
  summary: A parser for web pages
80
115
  test_files:
81
116
  - spec/parser_factory_spec.rb
82
117
  - spec/base_parser_spec.rb
83
- - spec/fixtures
84
- - spec/fixtures/bbc_news
118
+ - spec/parsers/bbc_news_page_spec.rb
119
+ - spec/fixtures/bbc_news/10341015.stm.html
120
+ - spec/fixtures/bbc_news/8029015.stm.html
121
+ - spec/fixtures/bbc_news/7745137.stm.html
85
122
  - spec/fixtures/bbc_news/8063681.stm.html
123
+ - spec/fixtures/bbc_news/10249066.stm.html
86
124
  - spec/fixtures/bbc_news/8011268.stm.html
125
+ - spec/fixtures/bbc_news/11125504.html
87
126
  - spec/fixtures/bbc_news/6072486.stm.html
88
- - spec/fixtures/bbc_news/8029015.stm.html
89
- - spec/fixtures/bbc_news/7745137.stm.html
90
- - spec/parsers
91
- - spec/parsers/bbc_news_page_spec.rb
92
127
  - spec/spec.opts