newscrapi 0.0.11

Files changed (59)
  1. data/.document +5 -0
  2. data/.gitignore +23 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +17 -0
  5. data/Rakefile +56 -0
  6. data/VERSION +1 -0
  7. data/config/content_scrapper.rb +3 -0
  8. data/doc/classes/ContentMapping.html +242 -0
  9. data/doc/classes/ContentMapping.src/M000001.html +18 -0
  10. data/doc/classes/ContentMapping.src/M000002.html +18 -0
  11. data/doc/classes/ContentMapping.src/M000003.html +18 -0
  12. data/doc/classes/ContentMapping.src/M000004.html +19 -0
  13. data/doc/classes/ContentMapping.src/M000005.html +18 -0
  14. data/doc/classes/ContentMapping.src/M000006.html +25 -0
  15. data/doc/classes/ContentScrapper.html +297 -0
  16. data/doc/classes/ContentScrapper.src/M000007.html +18 -0
  17. data/doc/classes/ContentScrapper.src/M000008.html +18 -0
  18. data/doc/classes/ContentScrapper.src/M000009.html +20 -0
  19. data/doc/classes/ContentScrapper.src/M000010.html +20 -0
  20. data/doc/classes/ContentScrapper.src/M000011.html +18 -0
  21. data/doc/classes/ContentScrapper.src/M000012.html +21 -0
  22. data/doc/classes/ContentScrapper.src/M000013.html +21 -0
  23. data/doc/classes/ContentScrapper.src/M000014.html +33 -0
  24. data/doc/classes/ContentScrapper.src/M000015.html +18 -0
  25. data/doc/classes/ContentScrapper.src/M000016.html +18 -0
  26. data/doc/classes/Feedzirra.html +111 -0
  27. data/doc/classes/Feedzirra/FeedEntryUtilities.html +152 -0
  28. data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000017.html +18 -0
  29. data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000018.html +18 -0
  30. data/doc/created.rid +1 -0
  31. data/doc/files/lib/content_scrapper/content_mapping_rb.html +108 -0
  32. data/doc/files/lib/content_scrapper/feedzirra_rb.html +115 -0
  33. data/doc/files/lib/content_scrapper_rb.html +112 -0
  34. data/doc/fr_class_index.html +30 -0
  35. data/doc/fr_file_index.html +29 -0
  36. data/doc/fr_method_index.html +44 -0
  37. data/doc/index.html +24 -0
  38. data/doc/rdoc-style.css +208 -0
  39. data/lib/newscrapi.rb +2 -0
  40. data/lib/newscrapi/encoding.rb +44 -0
  41. data/lib/newscrapi/feedzirra.rb +17 -0
  42. data/lib/newscrapi/mapping.rb +50 -0
  43. data/lib/newscrapi/scrapper.rb +129 -0
  44. data/lib/newscrapi/testing.rb +19 -0
  45. data/rails/init.rb +3 -0
  46. data/test/helper.rb +9 -0
  47. data/test/test_encoding.rb +43 -0
  48. data/test/test_mapping.rb +58 -0
  49. data/test/test_pages.rb +69 -0
  50. data/test/test_pages/cdata.html +23 -0
  51. data/test/test_pages/page_without_encoding_meta_tag.html +401 -0
  52. data/test/test_pages/pretty.html +17 -0
  53. data/test/test_pages/pretty_missing_content.html +17 -0
  54. data/test/test_pages/twocontent.html +11 -0
  55. data/test/test_pages/ugly.html +399 -0
  56. data/test/test_pages/utf-8_page.html +405 -0
  57. data/test/test_pages/windows-1250_page.html +460 -0
  58. data/test/test_scrapper.rb +257 -0
  59. metadata +191 -0
data/test/test_scrapper.rb ADDED
@@ -0,0 +1,257 @@
+ require 'helper'
+ require 'mocha'
+
+ class TestScrapper < Test::Unit::TestCase
+
+   Newscrapi::Scrapper.default_config_file = nil
+
+   context "on settings without sanitization tags" do
+     setup do
+       @scrapper = Newscrapi::Scrapper.new
+       @scrapper.instance_eval do
+         content_mapping do
+           url_pattern /.*/
+           content_at '//div[@id="itext_content"]'
+         end
+       end
+       content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+       stringio = StringIO.new(content)
+       Kernel.expects(:open).returns(stringio)
+     end
+     should 'not sanitize' do
+       assert !@scrapper.scrap_content('http://www.pretty.url/fsdsd').nil?
+     end
+   end
+
+   context "on common settings" do
+     setup do
+       @scrapper = Newscrapi::Scrapper.new
+       @scrapper.instance_eval do
+         content_mapping do
+           url_pattern /^http:\/\/www\.pretty\.url/
+           content_at '//div[@id="failing_content"]'
+           content_at '//div[@id="itext_content"]'
+         end
+
+         content_mapping do
+           url_pattern /^http:\/\/www\.twopatterns\.url/
+           content_at '//div[@id="failing_content"]'
+           content_at '//div[@id="itext_content"]'
+         end
+
+         content_mapping do
+           url_pattern /^http:\/\/www\.twopatterns\.url/
+           content_at '//div[@id="itext_second_content"]'
+         end
+
+         content_mapping do
+           url_pattern /^http:\/\/www\.skipper\.url/
+         end
+
+         content_mapping do
+           url_pattern /^http:\/\/www\.skipper\.url/
+           content_at '//div[@id="never_should_be_here"]'
+         end
+
+         sanitize_tags do
+           {:elements => ['p', 'br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'],
+            :attributes => { 'a' => ['href'] }}
+         end
+       end
+     end
+
+     should "identify the correct content mapper" do
+       content_mapper = @scrapper.matching_content_mapper('http://www.pretty.url/fsdsd')
+       assert !content_mapper.nil?
+       assert_equal /^http:\/\/www\.pretty\.url/, content_mapper.url_pattern_regexp
+     end
+
+     context "for known sources with expected content scrapping" do
+       setup do
+         pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+         stringio = StringIO.new(pretty_content)
+         Kernel.expects(:open).returns(stringio)
+         @entry_content = @scrapper.scrap_content('http://www.pretty.url/fsdsd')
+       end
+       should("identify the content") do
+         assert_match(%r{<p><strong>This is a strong text</strong></p>}, @entry_content)
+       end
+     end
+
+     context "for known pages with unexpected content scrapping" do
+       setup do
+         ugly_content = File.open("#{File.dirname(__FILE__)}/test_pages/ugly.html").read
+         stringio = StringIO.new(ugly_content)
+         Kernel.expects(:open).returns(stringio)
+         @entry_content = @scrapper.scrap_content('http://www.pretty.url/hsdae')
+       end
+       should("return nil") { assert_nil @entry_content }
+     end
+
+     context "for unknown pages" do
+       setup { @entry_content = @scrapper.scrap_content('http://www.unknown.url/hsdae') }
+       should("return nil") { assert_nil @entry_content }
+     end
+
+     context "multiple matching url patterns" do
+       setup do
+         twocontent = File.open("#{File.dirname(__FILE__)}/test_pages/twocontent.html").read
+         stringio = StringIO.new(twocontent)
+         Kernel.expects(:open).with('http://www.twopatterns.url').returns(stringio)
+         @entry_content = @scrapper.scrap_content('http://www.twopatterns.url')
+       end
+       should "match the first content" do
+         assert_equal 'The first one is matched', @entry_content
+       end
+     end
+
+     context "skipper patterns" do
+       setup do
+         Kernel.expects(:open).with('http://www.skipper.url/fdgsw').never
+         @entry_content = @scrapper.scrap_content('http://www.skipper.url/fdgsw')
+       end
+       should("not match anything") { assert_nil @entry_content }
+     end
+
+     context "on already downloaded document" do
+       setup do
+         pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+         Kernel.expects(:open).never
+         @scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
+                                                     :use_page => pretty_content)
+       end
+       should "scrap from the provided full page" do
+         assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
+       end
+     end
+
+     context "on scrapping with feedzirra" do
+       setup do
+         require 'newscrapi/feedzirra'
+         require 'sax-machine'
+         require 'feedzirra/parser/rss_entry'
+         require 'feedzirra/parser/atom_entry'
+       end
+
+       context "feed entry with not parsable remote content, but with feed content set" do
+         setup do
+           @feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
+           @feed_entries.each do |feed_entry|
+             feed_entry.url = 'http://www.unknown.url/wedhsf'
+             feed_entry.content = 'Pretty well written content is this.'
+           end
+           Kernel.expects(:open).with('http://www.unknown.url/wedhsf').never
+         end
+         should("return the original feed content") do
+           @feed_entries.each do |feed_entry|
+             assert_equal 'Pretty well written content is this.', feed_entry.scrap_content(@scrapper)
+             feed_entry.scrap_content!(@scrapper)
+             assert_equal 'Pretty well written content is this.', feed_entry.content
+           end
+         end
+       end
+
+       context "on feed entry with url and scrapping with full_page" do
+         setup do
+           @feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
+           @feed_entries.each do |feed_entry|
+             feed_entry.url = 'http://www.pretty.url/wedhsf'
+           end
+           @pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+           Kernel.expects(:open).never
+         end
+         should("return the original feed content") do
+           @feed_entries.each do |feed_entry|
+             assert_match(%r{<p><strong>This is a strong text</strong></p>},
+                          feed_entry.scrap_content(@scrapper, :use_page => @pretty_content))
+             feed_entry.scrap_content!(@scrapper, :use_page => @pretty_content)
+             assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
+           end
+         end
+       end
+     end
+
+     context "on failing scrapping" do
+       setup do
+         Kernel.expects(:open).raises(Exception, 'something failed')
+         @exception_handle_flag = nil
+         @scrapper.rescue_scrapping do |exception, url|
+           @exception_handle_flag = exception.message
+           @exception_url = url
+         end
+       end
+       should "catch the exception and handle it" do
+         assert_nil @scrapper.scrap_content('http://www.pretty.url')
+         assert_equal 'something failed', @exception_handle_flag
+         assert_equal 'http://www.pretty.url', @exception_url
+       end
+     end
+
+     context "on missing url matcher" do
+       setup do
+         Kernel.expects(:open).never
+         @missing_url_matcher_flag = nil
+         @scrapper.missing_url_matcher do |url|
+           @missing_url_matcher_flag = url
+         end
+         @scrapper.scrap_content('http://missing.url.matcher')
+       end
+       should "call the handler block" do
+         assert_equal 'http://missing.url.matcher', @missing_url_matcher_flag
+       end
+     end
+
+     context "on matching content mapper finding empty content" do
+       setup do
+         twocontent = File.open("#{File.dirname(__FILE__)}/test_pages/pretty_missing_content.html").read
+         stringio = StringIO.new(twocontent)
+         Kernel.expects(:open).with('http://www.pretty.url').returns(stringio)
+         @missing_content_flag = 0
+         @scrapper.missing_content do |url|
+           @missing_content_flag += 1
+         end
+         @scrapper.scrap_content('http://www.pretty.url')
+       end
+       should "call the handler block only once" do
+         assert_equal 1, @missing_content_flag
+       end
+     end
+   end
+
+   context "on setting default content scrapper" do
+     setup { @scrapper = Newscrapi::Scrapper.create_new_default }
+     should "set the default to the recently created" do
+       assert_equal @scrapper, Newscrapi::Scrapper.default
+     end
+     context "when changing default content scrapper" do
+       setup { @new_scrapper = Newscrapi::Scrapper.new.set_as_default }
+       should "change the default to the new content scrapper" do
+         assert_equal @new_scrapper, Newscrapi::Scrapper.default
+       end
+     end
+
+     context "for feed entry" do
+       setup do
+         @feed_entry = Feedzirra::Parser::RSSEntry.new
+         @feed_entry.url = 'http://www.unknown.url/gerhe'
+         @feed_entry.content = 'We should get this.'
+       end
+       should("scrap content by the default scrapper") do
+         assert_equal 'We should get this.', @feed_entry.scrap_content
+       end
+     end
+   end
+
+   context "on giving standard package of settings" do
+     setup do
+       @scrapper = Newscrapi::Scrapper.new
+       @scrapper.report_to_stderr
+     end
+     should "have the reporters set" do
+       assert !@scrapper.scrapping_exception_handler_block.nil?
+       assert !@scrapper.missing_url_matcher_handler_block.nil?
+       assert !@scrapper.missing_content_handler_block.nil?
+     end
+   end
+ end
+
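The test file above doubles as a specification of the scrapper DSL. As a minimal usage sketch distilled from these tests (the host pattern, XPath, and URLs below are illustrative placeholders, and `require 'newscrapi'` assumes the gem's top-level file loads the scrapper):

    require 'newscrapi'

    scrapper = Newscrapi::Scrapper.new
    scrapper.instance_eval do
      # Map URLs matching the pattern to the XPath holding the article body;
      # per the tests, the first content_at that yields content wins.
      content_mapping do
        url_pattern /^http:\/\/news\.example\.com/
        content_at '//div[@id="article_body"]'
      end
      # Whitelist of tags and attributes kept after scrapping.
      sanitize_tags do
        {:elements => ['p', 'a', 'strong'], :attributes => { 'a' => ['href'] }}
      end
      # Report download or parsing failures instead of raising.
      rescue_scrapping do |exception, url|
        STDERR.puts "scrapping #{url} failed: #{exception.message}"
      end
    end

    content = scrapper.scrap_content('http://news.example.com/some-article')

The feedzirra contexts suggest the companion feed workflow; a sketch under the same assumptions (Feedzirra::Feed.fetch_and_parse is stock feedzirra, not part of this gem):

    require 'newscrapi/feedzirra'  # mixes scrap_content into feedzirra entries

    feed = Feedzirra::Feed.fetch_and_parse('http://news.example.com/rss')
    feed.entries.each do |entry|
      entry.scrap_content!(scrapper)  # replaces entry.content in place
    end
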
metadata ADDED
@@ -0,0 +1,191 @@
+ --- !ruby/object:Gem::Specification
+ name: newscrapi
+ version: !ruby/object:Gem::Version
+   hash: 9
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 11
+   version: 0.0.11
+ platform: ruby
+ authors:
+ - Gyorgy Frivolt
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-10-11 00:00:00 +02:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: thoughtbot-shoulda
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 35
+         segments:
+         - 2
+         - 10
+         - 2
+         version: 2.10.2
+   type: :development
+   version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+   name: mocha
+   prerelease: false
+   requirement: &id002 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 43
+         segments:
+         - 0
+         - 9
+         - 8
+         version: 0.9.8
+   type: :development
+   version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 5
+         segments:
+         - 1
+         - 4
+         - 1
+         version: 1.4.1
+   type: :runtime
+   version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+   name: rchardet
+   prerelease: false
+   requirement: &id004 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   type: :runtime
+   version_requirements: *id004
+ description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
+ email: gyorgy.frivolt@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE
+ - README.rdoc
+ files:
+ - .document
+ - .gitignore
+ - LICENSE
+ - README.rdoc
+ - Rakefile
+ - VERSION
+ - config/content_scrapper.rb
+ - doc/classes/ContentMapping.html
+ - doc/classes/ContentMapping.src/M000001.html
+ - doc/classes/ContentMapping.src/M000002.html
+ - doc/classes/ContentMapping.src/M000003.html
+ - doc/classes/ContentMapping.src/M000004.html
+ - doc/classes/ContentMapping.src/M000005.html
+ - doc/classes/ContentMapping.src/M000006.html
+ - doc/classes/ContentScrapper.html
+ - doc/classes/ContentScrapper.src/M000007.html
+ - doc/classes/ContentScrapper.src/M000008.html
+ - doc/classes/ContentScrapper.src/M000009.html
+ - doc/classes/ContentScrapper.src/M000010.html
+ - doc/classes/ContentScrapper.src/M000011.html
+ - doc/classes/ContentScrapper.src/M000012.html
+ - doc/classes/ContentScrapper.src/M000013.html
+ - doc/classes/ContentScrapper.src/M000014.html
+ - doc/classes/ContentScrapper.src/M000015.html
+ - doc/classes/ContentScrapper.src/M000016.html
+ - doc/classes/Feedzirra.html
+ - doc/classes/Feedzirra/FeedEntryUtilities.html
+ - doc/classes/Feedzirra/FeedEntryUtilities.src/M000017.html
+ - doc/classes/Feedzirra/FeedEntryUtilities.src/M000018.html
+ - doc/created.rid
+ - doc/files/lib/content_scrapper/content_mapping_rb.html
+ - doc/files/lib/content_scrapper/feedzirra_rb.html
+ - doc/files/lib/content_scrapper_rb.html
+ - doc/fr_class_index.html
+ - doc/fr_file_index.html
+ - doc/fr_method_index.html
+ - doc/index.html
+ - doc/rdoc-style.css
+ - lib/newscrapi.rb
+ - lib/newscrapi/encoding.rb
+ - lib/newscrapi/feedzirra.rb
+ - lib/newscrapi/mapping.rb
+ - lib/newscrapi/scrapper.rb
+ - lib/newscrapi/testing.rb
+ - newscrapi.gemspec
+ - rails/init.rb
+ - test/helper.rb
+ - test/test_encoding.rb
+ - test/test_mapping.rb
+ - test/test_pages.rb
+ - test/test_pages/cdata.html
+ - test/test_pages/page_without_encoding_meta_tag.html
+ - test/test_pages/pretty.html
+ - test/test_pages/pretty_missing_content.html
+ - test/test_pages/twocontent.html
+ - test/test_pages/ugly.html
+ - test/test_pages/utf-8_page.html
+ - test/test_pages/windows-1250_page.html
+ - test/test_scrapper.rb
+ has_rdoc: true
+ homepage: http://github.com/fifigyuri/newscrapi
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --charset=UTF-8
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
+ test_files:
+ - test/helper.rb
+ - test/test_encoding.rb
+ - test/test_mapping.rb
+ - test/test_pages.rb
+ - test/test_scrapper.rb
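
Per the metadata above, nokogiri (>= 1.4.1) and rchardet are runtime dependencies, while thoughtbot-shoulda and mocha are development-only. Assuming the standard RubyGems workflow, this release installs with:

    gem install newscrapi -v 0.0.11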