content_scrapper 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/.specification ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: content_scrapper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Gyorgy Frivolt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-13 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: thoughtbot-shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.10.2
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mocha
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.8
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: sanitize
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.2.0
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: nokogiri
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.1
54
+ version:
55
+ description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
56
+ email: gyorgy.frivolt@gmail.com
57
+ executables: []
58
+
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.rdoc
64
+ files:
65
+ - .document
66
+ - .gitignore
67
+ - LICENSE
68
+ - README.rdoc
69
+ - Rakefile
70
+ - VERSION
71
+ - config/content_scrapper.rb
72
+ - content_scrapper.gemspec
73
+ - lib/content_scrapper.rb
74
+ - lib/content_scrapper/content_mapping.rb
75
+ - lib/content_scrapper/feedzirra.rb
76
+ - rails/init.rb
77
+ - test/helper.rb
78
+ - test/test_content_mapping.rb
79
+ - test/test_content_scrapper.rb
80
+ - test/test_pages/pretty.html
81
+ - test/test_pages/twocontent.html
82
+ - test/test_pages/ugly.html
83
+ has_rdoc: true
84
+ homepage: http://github.com/fifigyuri/content_scrapper
85
+ licenses: []
86
+
87
+ post_install_message:
88
+ rdoc_options:
89
+ - --charset=UTF-8
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: "0"
97
+ version:
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ requirements: []
105
+
106
+ rubyforge_project:
107
+ rubygems_version: 1.3.5
108
+ signing_key:
109
+ specification_version: 3
110
+ summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
111
+ test_files:
112
+ - test/test_content_mapping.rb
113
+ - test/test_content_scrapper.rb
114
+ - test/helper.rb
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.4
1
+ 0.0.5
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.4"
8
+ s.version = "0.0.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
12
- s.date = %q{2010-02-13}
12
+ s.date = %q{2010-02-22}
13
13
  s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
14
14
  s.email = %q{gyorgy.frivolt@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  ".gitignore",
22
+ ".specification",
22
23
  "LICENSE",
23
24
  "README.rdoc",
24
25
  "Rakefile",
@@ -19,10 +19,12 @@ class ContentMapping
19
19
  url =~ @url_pattern_regexp
20
20
  end
21
21
 
22
- def scrap_content(doc)
22
+ def scrap_content(doc, content_scrapper = nil)
23
23
  @content_xpaths_list.each do |content_xpath|
24
24
  content_section = doc.xpath(content_xpath)
25
- return content_section.to_a.join("\n") if content_section.count > 0
25
+ content = content_section.to_a.join("\n")
26
+ content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
27
+ return content if content_section.count > 0
26
28
  end
27
29
  nil
28
30
  end
@@ -24,7 +24,7 @@ class ContentScrapper
24
24
 
25
25
  def initialize(scrapper_config_file = nil)
26
26
  @content_mappings = []
27
- config_file = ContentScrapper.default_config_file
27
+ config_file = scrapper_config_file || ContentScrapper.default_config_file
28
28
  self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
29
29
  end
30
30
 
@@ -34,8 +34,8 @@ class ContentScrapper
34
34
  @content_mappings << new_mapping
35
35
  end
36
36
 
37
- def sanitize_tags(settings)
38
- @sanitize_settings = settings
37
+ def sanitize_tags(&block)
38
+ @sanitize_settings = block.call()
39
39
  end
40
40
 
41
41
  def scrap_content(url)
@@ -44,17 +44,23 @@ class ContentScrapper
44
44
  return nil if content_mapping.content_xpaths_list.empty?
45
45
  begin
46
46
  doc = Nokogiri::HTML(Kernel.open(url))
47
- content = content_mapping.scrap_content(doc)
48
- return nil if content.nil?
49
- return Sanitize.clean(content, sanitize_settings)
47
+ return content_mapping.scrap_content(doc, content_scrapper = self)
50
48
  rescue Exception
51
- scrap_content_exception($!)
49
+ @scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
50
+ return nil
52
51
  end
53
52
  end
54
53
  end
54
+ @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
55
55
  nil
56
56
  end
57
57
 
58
- def scrap_content_exception(exception)
58
+ def rescue_scrapping(&block)
59
+ @scrapping_exception_handler_block = block
60
+ end
61
+
62
+ def missing_url_matcher(&block)
63
+ @missing_url_matcher_handler_block = block
59
64
  end
60
65
  end
66
+
@@ -26,7 +26,8 @@ class TestContentMapping < Test::Unit::TestCase
26
26
  @document = Nokogiri::HTML(pretty_content)
27
27
  end
28
28
  should "extract the content" do
29
- assert_match(%r{<p><strong>This is a strong text</strong></p>}, @mapping.scrap_content(@document))
29
+ assert_match(%r{<p><strong>This is a strong text</strong></p>},
30
+ @mapping.scrap_content(@document))
30
31
  end
31
32
  end
32
33
  context "on document with two content parts" do
@@ -5,7 +5,7 @@ class TestContentScrapper < Test::Unit::TestCase
5
5
 
6
6
  ContentScrapper.default_config_file = nil
7
7
 
8
- context "on common setting" do
8
+ context "on common settings" do
9
9
  setup do
10
10
  @scrapper = ContentScrapper.new
11
11
  @scrapper.instance_eval do
@@ -35,8 +35,10 @@ class TestContentScrapper < Test::Unit::TestCase
35
35
  content_at '//div[@id="never_should_be_here"]'
36
36
  end
37
37
 
38
- sanitize_tags ({:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
39
- :attributes => { 'a' => ['href'] }})
38
+ sanitize_tags do
39
+ {:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
40
+ :attributes => { 'a' => ['href'] }}
41
+ end
40
42
  end
41
43
  end
42
44
 
@@ -113,6 +115,34 @@ class TestContentScrapper < Test::Unit::TestCase
113
115
  end
114
116
  end
115
117
  end
118
+
119
+ context "on failing scrapping" do
120
+ setup do
121
+ Kernel.expects(:open).raises(Exception, 'something failed')
122
+ @exception_handle_flag = nil
123
+ @scrapper.rescue_scrapping do |exception|
124
+ @exception_handle_flag = exception.message
125
+ end
126
+ end
127
+ should "catch the exception and handle it" do
128
+ assert_nil @scrapper.scrap_content('http://www.pretty.url')
129
+ assert_equal 'something failed', @exception_handle_flag
130
+ end
131
+ end
132
+
133
+ context "on missing url matcher" do
134
+ setup do
135
+ Kernel.expects(:open).never
136
+ @missing_url_matcher_flag = nil
137
+ @scrapper.missing_url_matcher do |url|
138
+ @missing_url_matcher_flag = url
139
+ end
140
+ @scrapper.scrap_content('http://missing.url.matcher')
141
+ end
142
+ should "call the handler block" do
143
+ assert_equal 'http://missing.url.matcher', @missing_url_matcher_flag
144
+ end
145
+ end
116
146
  end
117
147
 
118
148
  context "on setting default content scrapper" do
@@ -126,6 +156,7 @@ class TestContentScrapper < Test::Unit::TestCase
126
156
  assert_equal @new_scrapper, ContentScrapper.default
127
157
  end
128
158
  end
159
+
129
160
  context "for feed entry" do
130
161
  setup do
131
162
  @feed_entry = Feedzirra::Parser::RSSEntry.new
@@ -138,3 +169,4 @@ class TestContentScrapper < Test::Unit::TestCase
138
169
  end
139
170
  end
140
171
  end
172
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-13 00:00:00 +01:00
12
+ date: 2010-02-22 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -64,6 +64,7 @@ extra_rdoc_files:
64
64
  files:
65
65
  - .document
66
66
  - .gitignore
67
+ - .specification
67
68
  - LICENSE
68
69
  - README.rdoc
69
70
  - Rakefile