content_scrapper 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -13,7 +13,6 @@ begin
13
13
  gem.add_development_dependency 'thoughtbot-shoulda', '>=2.10.2'
14
14
  gem.add_development_dependency 'mocha', '>=0.9.8'
15
15
 
16
- gem.add_dependency 'sanitize', '>=1.2.0'
17
16
  gem.add_dependency 'nokogiri', '>=1.4.1'
18
17
  end
19
18
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.6
1
+ 0.0.7
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.6"
8
+ s.version = "0.0.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
@@ -19,7 +19,6 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  ".gitignore",
22
- ".specification",
23
22
  "LICENSE",
24
23
  "README.rdoc",
25
24
  "Rakefile",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
33
32
  "test/helper.rb",
34
33
  "test/test_content_mapping.rb",
35
34
  "test/test_content_scrapper.rb",
35
+ "test/test_pages.rb",
36
+ "test/test_pages/cdata.html",
36
37
  "test/test_pages/encoding.html",
37
38
  "test/test_pages/pretty.html",
38
39
  "test/test_pages/twocontent.html",
@@ -46,6 +47,7 @@ Gem::Specification.new do |s|
46
47
  s.test_files = [
47
48
  "test/test_content_mapping.rb",
48
49
  "test/test_content_scrapper.rb",
50
+ "test/test_pages.rb",
49
51
  "test/helper.rb"
50
52
  ]
51
53
 
@@ -56,18 +58,15 @@ Gem::Specification.new do |s|
56
58
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
57
59
  s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
58
60
  s.add_development_dependency(%q<mocha>, [">= 0.9.8"])
59
- s.add_runtime_dependency(%q<sanitize>, [">= 1.2.0"])
60
61
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
61
62
  else
62
63
  s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
63
64
  s.add_dependency(%q<mocha>, [">= 0.9.8"])
64
- s.add_dependency(%q<sanitize>, [">= 1.2.0"])
65
65
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
66
66
  end
67
67
  else
68
68
  s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
69
69
  s.add_dependency(%q<mocha>, [">= 0.9.8"])
70
- s.add_dependency(%q<sanitize>, [">= 1.2.0"])
71
70
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
72
71
  end
73
72
  end
@@ -1,7 +1,6 @@
1
1
 
2
2
  require 'open-uri'
3
3
  require 'nokogiri'
4
- require 'sanitize'
5
4
 
6
5
  require 'content_scrapper/content_mapping'
7
6
 
@@ -20,7 +19,7 @@ class ContentScrapper
20
19
  ContentScrapper.default = self
21
20
  end
22
21
 
23
- attr_accessor :content_mappings, :sanitize_settings
22
+ attr_reader :content_mappings
24
23
 
25
24
  def initialize(scrapper_config_file = nil)
26
25
  @content_mappings = []
@@ -34,8 +33,22 @@ class ContentScrapper
34
33
  @content_mappings << new_mapping
35
34
  end
36
35
 
37
- def sanitize_tags(&block)
38
- @sanitize_settings = block.call()
36
+ def clean_content(content)
37
+ @content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
38
+ end
39
+
40
+ def sanitize_tags(&sanitize_settings)
41
+ @content_cleaner_block = lambda do |content|
42
+ require 'sanitize'
43
+ Sanitize.clean(content, sanitize_settings.call())
44
+ end
45
+ end
46
+
47
+ def loofah_tags(scrap_type)
48
+ @content_scrapper_block = lambda do |content|
49
+ require 'loofah'
50
+ Loofah.document(content).scrub!(scrap_type).to_s
51
+ end
39
52
  end
40
53
 
41
54
  def scrap_content(url)
@@ -29,10 +29,11 @@ class ContentMapping
29
29
  @content_xpaths_list.each do |content_xpath|
30
30
  content_section = doc.xpath(content_xpath)
31
31
  content = content_section.to_a.join("\n")
32
- content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
32
+ content = content_scrapper.clean_content(content) unless content_scrapper.nil?
33
33
  content = Iconv.conv(to=iconv_to, from=iconv_from, content) unless iconv_to.nil?
34
34
  return content if content_section.count > 0
35
35
  end
36
36
  nil
37
37
  end
38
38
  end
39
+
@@ -9,7 +9,7 @@ class TestContentMapping < Test::Unit::TestCase
9
9
  @mapping = ContentMapping.new
10
10
  @mapping.instance_eval do
11
11
  url_pattern /^http:\/\/www\.matchme\.com\//
12
- content_at '//div[@id="failing_content"]'
12
+ content_at '//div[@id="failing_content"]'
13
13
  content_at '//div[@id="itext_content"]'
14
14
  content_at '//div[@id="itext_second_content"]'
15
15
  end
@@ -46,7 +46,7 @@ class TestContentMapping < Test::Unit::TestCase
46
46
  @mapping = ContentMapping.new
47
47
  @mapping.instance_eval do
48
48
  url_pattern /^http:\/\/www\.matchme\.com\//
49
- content_at '//div[@class="node node-story"]/div[@class="content"]/p'
49
+ content_at '//div[@class="node node-story"]/div[@class="content"]/p'
50
50
  iconv :to => 'utf8', :from => 'latin1'
51
51
  end
52
52
  page = File.open("#{File.dirname(__FILE__)}/test_pages/encoding.html").read
@@ -5,6 +5,24 @@ class TestContentScrapper < Test::Unit::TestCase
5
5
 
6
6
  ContentScrapper.default_config_file = nil
7
7
 
8
+ context "on settings without sanitization tags" do
9
+ setup do
10
+ @scrapper = ContentScrapper.new
11
+ @scrapper.instance_eval do
12
+ content_mapping do
13
+ url_pattern /.*/
14
+ content_at '//div[@id="itext_content"]'
15
+ end
16
+ end
17
+ content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
18
+ stringio = StringIO.new(content)
19
+ Kernel.expects(:open).returns(stringio)
20
+ end
21
+ should 'not sanitize' do
22
+ assert !@scrapper.scrap_content('http://www.pretty.url/fsdsd').nil?
23
+ end
24
+ end
25
+
8
26
  context "on common settings" do
9
27
  setup do
10
28
  @scrapper = ContentScrapper.new
@@ -0,0 +1,29 @@
1
+
2
+ require 'helper'
3
+ require 'mocha'
4
+
5
+ class TestContentScrapper < Test::Unit::TestCase
6
+
7
+ context "on page containing CDATA" do
8
+ setup do
9
+ @scrapper = ContentScrapper.new
10
+ @scrapper.instance_eval do
11
+ content_mapping do
12
+ url_pattern /.*/
13
+ content_at '//div[@class="art-full adwords-text"]'
14
+ end
15
+ loofah_tags(:strip)
16
+ end
17
+ @scrapper.rescue_scrapping do |exception|
18
+ puts exception
19
+ end
20
+ cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
21
+ Kernel.expects(:open).returns(StringIO.new(cdata_content))
22
+ end
23
+ should "not escape the cdata entries, should leave cdata unvisible" do
24
+ #<!--<![CDATA[
25
+ assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
26
+ end
27
+ end
28
+ end
29
+
@@ -0,0 +1,23 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <body>
3
+ <div class="art-full adwords-text">
4
+ <h1>Judínyová: Celebrity sveta módy zažila na vlastnej koži</h1>
5
+ <div id="zoom" class="zoom2">
6
+ <div class="art-info">Pravda.sk |
7
+ <span class="time-date">25.&nbsp;februára&nbsp;2010&nbsp;&nbsp;5:51</span>
8
+ </div>
9
+ <div class="perex" id="article-perex">Moderátorka Erika Judínyová sa nedávno vrátila z výletu v New Yorku. Zúčastnila sa tam na módnom týždni a s televíznym štábom aj niekoľko prehliadok. </div>
10
+ <ul>
11
+ <li class="fotogaleria">
12
+ <a href="/foto.asp?r=sk-kkoktail&amp;c=A100224_165338_sk-kkoktail_p20">
13
+ Galéria: Erika Judínyová</a>
14
+ <script type="text/javascript"><!--<![CDATA[
15
+ /* SLAVE: perex_sk.perex_sk.perex.koktail.center */
16
+ ado.slave('adoceanskqdisnunpvu', {myMaster: 'uikHnAPTNwh_AVZX4uAdPP6xUQPhUSb01rCKlMcgapn.97' });
17
+ //]]>--></script>
18
+ <!-- [/Koktail/CENTER] -->
19
+ </div><p>&quot;Videli sme napríklad šou Custo Barcelona alebo prehliadku Very Wangovej,&quot; prezradila pre Pravda.sk moderátorka Smotánky. Spolu so štábom zaznamenávala dianie. Jeden z najzaujímavejších úlovkov pre kameru markízackej relácie bola šéfredaktorka magazínu Vogue Anna Wintour. &quot;Prišla na prehliadku, nasadila si tmavé okuliare a potichu zívala. Čo ma však najviac prekvapilo, nezostala do konca prehliadky, asi tri minúty pred koncom sa povýšenecky zdvihla a odišla,&quot; zaspomínala si Judínyová.</p>
20
+ </li>
21
+ </ul>
22
+ </body>
23
+ </html>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt
@@ -32,16 +32,6 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: 0.9.8
34
34
  version:
35
- - !ruby/object:Gem::Dependency
36
- name: sanitize
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.2.0
44
- version:
45
35
  - !ruby/object:Gem::Dependency
46
36
  name: nokogiri
47
37
  type: :runtime
@@ -64,7 +54,6 @@ extra_rdoc_files:
64
54
  files:
65
55
  - .document
66
56
  - .gitignore
67
- - .specification
68
57
  - LICENSE
69
58
  - README.rdoc
70
59
  - Rakefile
@@ -78,6 +67,8 @@ files:
78
67
  - test/helper.rb
79
68
  - test/test_content_mapping.rb
80
69
  - test/test_content_scrapper.rb
70
+ - test/test_pages.rb
71
+ - test/test_pages/cdata.html
81
72
  - test/test_pages/encoding.html
82
73
  - test/test_pages/pretty.html
83
74
  - test/test_pages/twocontent.html
@@ -113,4 +104,5 @@ summary: Gem for those who want to screen scrap only the content part of web pag
113
104
  test_files:
114
105
  - test/test_content_mapping.rb
115
106
  - test/test_content_scrapper.rb
107
+ - test/test_pages.rb
116
108
  - test/helper.rb
data/.specification DELETED
@@ -1,115 +0,0 @@
1
- --- !ruby/object:Gem::Specification
2
- name: content_scrapper
3
- version: !ruby/object:Gem::Version
4
- version: 99.99.99
5
- platform: ruby
6
- authors:
7
- - Gyorgy Frivolt
8
- autorequire:
9
- bindir: bin
10
- cert_chain: []
11
-
12
- date: 2010-02-13 00:00:00 +01:00
13
- default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: thoughtbot-shoulda
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 2.10.2
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: mocha
27
- type: :development
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.9.8
34
- version:
35
- - !ruby/object:Gem::Dependency
36
- name: sanitize
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.2.0
44
- version:
45
- - !ruby/object:Gem::Dependency
46
- name: nokogiri
47
- type: :runtime
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
50
- requirements:
51
- - - ">="
52
- - !ruby/object:Gem::Version
53
- version: 1.4.1
54
- version:
55
- description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
56
- email: gyorgy.frivolt@gmail.com
57
- executables: []
58
-
59
- extensions: []
60
-
61
- extra_rdoc_files:
62
- - LICENSE
63
- - README.rdoc
64
- files:
65
- - .document
66
- - .gitignore
67
- - LICENSE
68
- - README.rdoc
69
- - Rakefile
70
- - VERSION
71
- - config/content_scrapper.rb
72
- - content_scrapper.gemspec
73
- - lib/content_scrapper.rb
74
- - lib/content_scrapper/content_mapping.rb
75
- - lib/content_scrapper/feedzirra.rb
76
- - rails/init.rb
77
- - test/helper.rb
78
- - test/test_content_mapping.rb
79
- - test/test_content_scrapper.rb
80
- - test/test_pages/pretty.html
81
- - test/test_pages/twocontent.html
82
- - test/test_pages/ugly.html
83
- has_rdoc: true
84
- homepage: http://github.com/fifigyuri/content_scrapper
85
- licenses: []
86
-
87
- post_install_message:
88
- rdoc_options:
89
- - --charset=UTF-8
90
- require_paths:
91
- - bin
92
- - lib
93
- required_ruby_version: !ruby/object:Gem::Requirement
94
- requirements:
95
- - - ">="
96
- - !ruby/object:Gem::Version
97
- version: "0"
98
- version:
99
- required_rubygems_version: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: "0"
104
- version:
105
- requirements: []
106
-
107
- rubyforge_project:
108
- rubygems_version: 1.3.5
109
- signing_key:
110
- specification_version: 3
111
- summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
112
- test_files:
113
- - test/test_content_mapping.rb
114
- - test/test_content_scrapper.rb
115
- - test/helper.rb