content_scrapper 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.specification ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: content_scrapper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Gyorgy Frivolt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-13 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: thoughtbot-shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.10.2
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mocha
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.8
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: sanitize
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.2.0
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: nokogiri
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.1
54
+ version:
55
+ description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
56
+ email: gyorgy.frivolt@gmail.com
57
+ executables: []
58
+
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.rdoc
64
+ files:
65
+ - .document
66
+ - .gitignore
67
+ - LICENSE
68
+ - README.rdoc
69
+ - Rakefile
70
+ - VERSION
71
+ - config/content_scrapper.rb
72
+ - content_scrapper.gemspec
73
+ - lib/content_scrapper.rb
74
+ - lib/content_scrapper/content_mapping.rb
75
+ - lib/content_scrapper/feedzirra.rb
76
+ - rails/init.rb
77
+ - test/helper.rb
78
+ - test/test_content_mapping.rb
79
+ - test/test_content_scrapper.rb
80
+ - test/test_pages/pretty.html
81
+ - test/test_pages/twocontent.html
82
+ - test/test_pages/ugly.html
83
+ has_rdoc: true
84
+ homepage: http://github.com/fifigyuri/content_scrapper
85
+ licenses: []
86
+
87
+ post_install_message:
88
+ rdoc_options:
89
+ - --charset=UTF-8
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: "0"
97
+ version:
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ requirements: []
105
+
106
+ rubyforge_project:
107
+ rubygems_version: 1.3.5
108
+ signing_key:
109
+ specification_version: 3
110
+ summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
111
+ test_files:
112
+ - test/test_content_mapping.rb
113
+ - test/test_content_scrapper.rb
114
+ - test/helper.rb
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.4
1
+ 0.0.5
data/content_scrapper.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.4"
8
+ s.version = "0.0.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
12
- s.date = %q{2010-02-13}
12
+ s.date = %q{2010-02-22}
13
13
  s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
14
14
  s.email = %q{gyorgy.frivolt@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  ".gitignore",
22
+ ".specification",
22
23
  "LICENSE",
23
24
  "README.rdoc",
24
25
  "Rakefile",
data/lib/content_scrapper/content_mapping.rb CHANGED
@@ -19,10 +19,12 @@ class ContentMapping
19
19
  url =~ @url_pattern_regexp
20
20
  end
21
21
 
22
- def scrap_content(doc)
22
+ def scrap_content(doc, content_scrapper = nil)
23
23
  @content_xpaths_list.each do |content_xpath|
24
24
  content_section = doc.xpath(content_xpath)
25
- return content_section.to_a.join("\n") if content_section.count > 0
25
+ content = content_section.to_a.join("\n")
26
+ content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
27
+ return content if content_section.count > 0
26
28
  end
27
29
  nil
28
30
  end
data/lib/content_scrapper.rb CHANGED
@@ -24,7 +24,7 @@ class ContentScrapper
24
24
 
25
25
  def initialize(scrapper_config_file = nil)
26
26
  @content_mappings = []
27
- config_file = ContentScrapper.default_config_file
27
+ config_file = scrapper_config_file || ContentScrapper.default_config_file
28
28
  self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
29
29
  end
30
30
 
@@ -34,8 +34,8 @@ class ContentScrapper
34
34
  @content_mappings << new_mapping
35
35
  end
36
36
 
37
- def sanitize_tags(settings)
38
- @sanitize_settings = settings
37
+ def sanitize_tags(&block)
38
+ @sanitize_settings = block.call()
39
39
  end
40
40
 
41
41
  def scrap_content(url)
@@ -44,17 +44,23 @@ class ContentScrapper
44
44
  return nil if content_mapping.content_xpaths_list.empty?
45
45
  begin
46
46
  doc = Nokogiri::HTML(Kernel.open(url))
47
- content = content_mapping.scrap_content(doc)
48
- return nil if content.nil?
49
- return Sanitize.clean(content, sanitize_settings)
47
+ return content_mapping.scrap_content(doc, content_scrapper = self)
50
48
  rescue Exception
51
- scrap_content_exception($!)
49
+ @scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
50
+ return nil
52
51
  end
53
52
  end
54
53
  end
54
+ @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
55
55
  nil
56
56
  end
57
57
 
58
- def scrap_content_exception(exception)
58
+ def rescue_scrapping(&block)
59
+ @scrapping_exception_handler_block = block
60
+ end
61
+
62
+ def missing_url_matcher(&block)
63
+ @missing_url_matcher_handler_block = block
59
64
  end
60
65
  end
66
+
data/test/test_content_mapping.rb CHANGED
@@ -26,7 +26,8 @@ class TestContentMapping < Test::Unit::TestCase
26
26
  @document = Nokogiri::HTML(pretty_content)
27
27
  end
28
28
  should "extract the content" do
29
- assert_match(%r{<p><strong>This is a strong text</strong></p>}, @mapping.scrap_content(@document))
29
+ assert_match(%r{<p><strong>This is a strong text</strong></p>},
30
+ @mapping.scrap_content(@document))
30
31
  end
31
32
  end
32
33
  context "on document with two content parts" do
data/test/test_content_scrapper.rb CHANGED
@@ -5,7 +5,7 @@ class TestContentScrapper < Test::Unit::TestCase
5
5
 
6
6
  ContentScrapper.default_config_file = nil
7
7
 
8
- context "on common setting" do
8
+ context "on common settings" do
9
9
  setup do
10
10
  @scrapper = ContentScrapper.new
11
11
  @scrapper.instance_eval do
@@ -35,8 +35,10 @@ class TestContentScrapper < Test::Unit::TestCase
35
35
  content_at '//div[@id="never_should_be_here"]'
36
36
  end
37
37
 
38
- sanitize_tags ({:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
39
- :attributes => { 'a' => ['href'] }})
38
+ sanitize_tags do
39
+ {:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
40
+ :attributes => { 'a' => ['href'] }}
41
+ end
40
42
  end
41
43
  end
42
44
 
@@ -113,6 +115,34 @@ class TestContentScrapper < Test::Unit::TestCase
113
115
  end
114
116
  end
115
117
  end
118
+
119
+ context "on failing scrapping" do
120
+ setup do
121
+ Kernel.expects(:open).raises(Exception, 'something failed')
122
+ @exception_handle_flag = nil
123
+ @scrapper.rescue_scrapping do |exception|
124
+ @exception_handle_flag = exception.message
125
+ end
126
+ end
127
+ should "catch the exception and handle it" do
128
+ assert_nil @scrapper.scrap_content('http://www.pretty.url')
129
+ assert_equal 'something failed', @exception_handle_flag
130
+ end
131
+ end
132
+
133
+ context "on missing url matcher" do
134
+ setup do
135
+ Kernel.expects(:open).never
136
+ @missing_url_matcher_flag = nil
137
+ @scrapper.missing_url_matcher do |url|
138
+ @missing_url_matcher_flag = url
139
+ end
140
+ @scrapper.scrap_content('http://missing.url.matcher')
141
+ end
142
+ should "call the handler block" do
143
+ assert_equal 'http://missing.url.matcher', @missing_url_matcher_flag
144
+ end
145
+ end
116
146
  end
117
147
 
118
148
  context "on setting default content scrapper" do
@@ -126,6 +156,7 @@ class TestContentScrapper < Test::Unit::TestCase
126
156
  assert_equal @new_scrapper, ContentScrapper.default
127
157
  end
128
158
  end
159
+
129
160
  context "for feed entry" do
130
161
  setup do
131
162
  @feed_entry = Feedzirra::Parser::RSSEntry.new
@@ -138,3 +169,4 @@ class TestContentScrapper < Test::Unit::TestCase
138
169
  end
139
170
  end
140
171
  end
172
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-13 00:00:00 +01:00
12
+ date: 2010-02-22 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -64,6 +64,7 @@ extra_rdoc_files:
64
64
  files:
65
65
  - .document
66
66
  - .gitignore
67
+ - .specification
67
68
  - LICENSE
68
69
  - README.rdoc
69
70
  - Rakefile