content_scrapper 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -13,7 +13,6 @@ begin
13
13
  gem.add_development_dependency 'thoughtbot-shoulda', '>=2.10.2'
14
14
  gem.add_development_dependency 'mocha', '>=0.9.8'
15
15
 
16
- gem.add_dependency 'sanitize', '>=1.2.0'
17
16
  gem.add_dependency 'nokogiri', '>=1.4.1'
18
17
  end
19
18
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.6
1
+ 0.0.7
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.6"
8
+ s.version = "0.0.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
@@ -19,7 +19,6 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  ".gitignore",
22
- ".specification",
23
22
  "LICENSE",
24
23
  "README.rdoc",
25
24
  "Rakefile",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
33
32
  "test/helper.rb",
34
33
  "test/test_content_mapping.rb",
35
34
  "test/test_content_scrapper.rb",
35
+ "test/test_pages.rb",
36
+ "test/test_pages/cdata.html",
36
37
  "test/test_pages/encoding.html",
37
38
  "test/test_pages/pretty.html",
38
39
  "test/test_pages/twocontent.html",
@@ -46,6 +47,7 @@ Gem::Specification.new do |s|
46
47
  s.test_files = [
47
48
  "test/test_content_mapping.rb",
48
49
  "test/test_content_scrapper.rb",
50
+ "test/test_pages.rb",
49
51
  "test/helper.rb"
50
52
  ]
51
53
 
@@ -56,18 +58,15 @@ Gem::Specification.new do |s|
56
58
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
57
59
  s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
58
60
  s.add_development_dependency(%q<mocha>, [">= 0.9.8"])
59
- s.add_runtime_dependency(%q<sanitize>, [">= 1.2.0"])
60
61
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
61
62
  else
62
63
  s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
63
64
  s.add_dependency(%q<mocha>, [">= 0.9.8"])
64
- s.add_dependency(%q<sanitize>, [">= 1.2.0"])
65
65
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
66
66
  end
67
67
  else
68
68
  s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
69
69
  s.add_dependency(%q<mocha>, [">= 0.9.8"])
70
- s.add_dependency(%q<sanitize>, [">= 1.2.0"])
71
70
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
72
71
  end
73
72
  end
@@ -1,7 +1,6 @@
1
1
 
2
2
  require 'open-uri'
3
3
  require 'nokogiri'
4
- require 'sanitize'
5
4
 
6
5
  require 'content_scrapper/content_mapping'
7
6
 
@@ -20,7 +19,7 @@ class ContentScrapper
20
19
  ContentScrapper.default = self
21
20
  end
22
21
 
23
- attr_accessor :content_mappings, :sanitize_settings
22
+ attr_reader :content_mappings
24
23
 
25
24
  def initialize(scrapper_config_file = nil)
26
25
  @content_mappings = []
@@ -34,8 +33,22 @@ class ContentScrapper
34
33
  @content_mappings << new_mapping
35
34
  end
36
35
 
37
- def sanitize_tags(&block)
38
- @sanitize_settings = block.call()
36
+ def clean_content(content)
37
+ @content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
38
+ end
39
+
40
+ def sanitize_tags(&sanitize_settings)
41
+ @content_cleaner_block = lambda do |content|
42
+ require 'sanitize'
43
+ Sanitize.clean(content, sanitize_settings.call())
44
+ end
45
+ end
46
+
47
+ def loofah_tags(scrap_type)
48
+ @content_cleaner_block = lambda do |content|
49
+ require 'loofah'
50
+ Loofah.document(content).scrub!(scrap_type).to_s
51
+ end
39
52
  end
40
53
 
41
54
  def scrap_content(url)
@@ -29,10 +29,11 @@ class ContentMapping
29
29
  @content_xpaths_list.each do |content_xpath|
30
30
  content_section = doc.xpath(content_xpath)
31
31
  content = content_section.to_a.join("\n")
32
- content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
32
+ content = content_scrapper.clean_content(content) unless content_scrapper.nil?
33
33
  content = Iconv.conv(to=iconv_to, from=iconv_from, content) unless iconv_to.nil?
34
34
  return content if content_section.count > 0
35
35
  end
36
36
  nil
37
37
  end
38
38
  end
39
+
@@ -9,7 +9,7 @@ class TestContentMapping < Test::Unit::TestCase
9
9
  @mapping = ContentMapping.new
10
10
  @mapping.instance_eval do
11
11
  url_pattern /^http:\/\/www\.matchme\.com\//
12
- content_at '//div[@id="failing_content"]'
12
+ content_at '//div[@id="failing_content"]'
13
13
  content_at '//div[@id="itext_content"]'
14
14
  content_at '//div[@id="itext_second_content"]'
15
15
  end
@@ -46,7 +46,7 @@ class TestContentMapping < Test::Unit::TestCase
46
46
  @mapping = ContentMapping.new
47
47
  @mapping.instance_eval do
48
48
  url_pattern /^http:\/\/www\.matchme\.com\//
49
- content_at '//div[@class="node node-story"]/div[@class="content"]/p'
49
+ content_at '//div[@class="node node-story"]/div[@class="content"]/p'
50
50
  iconv :to => 'utf8', :from => 'latin1'
51
51
  end
52
52
  page = File.open("#{File.dirname(__FILE__)}/test_pages/encoding.html").read
@@ -5,6 +5,24 @@ class TestContentScrapper < Test::Unit::TestCase
5
5
 
6
6
  ContentScrapper.default_config_file = nil
7
7
 
8
+ context "on settings without sanitization tags" do
9
+ setup do
10
+ @scrapper = ContentScrapper.new
11
+ @scrapper.instance_eval do
12
+ content_mapping do
13
+ url_pattern /.*/
14
+ content_at '//div[@id="itext_content"]'
15
+ end
16
+ end
17
+ content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
18
+ stringio = StringIO.new(content)
19
+ Kernel.expects(:open).returns(stringio)
20
+ end
21
+ should 'not sanitize' do
22
+ assert !@scrapper.scrap_content('http://www.pretty.url/fsdsd').nil?
23
+ end
24
+ end
25
+
8
26
  context "on common settings" do
9
27
  setup do
10
28
  @scrapper = ContentScrapper.new
@@ -0,0 +1,29 @@
1
+
2
+ require 'helper'
3
+ require 'mocha'
4
+
5
+ class TestPages < Test::Unit::TestCase
6
+
7
+ context "on page containing CDATA" do
8
+ setup do
9
+ @scrapper = ContentScrapper.new
10
+ @scrapper.instance_eval do
11
+ content_mapping do
12
+ url_pattern /.*/
13
+ content_at '//div[@class="art-full adwords-text"]'
14
+ end
15
+ loofah_tags(:strip)
16
+ end
17
+ @scrapper.rescue_scrapping do |exception|
18
+ puts exception
19
+ end
20
+ cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
21
+ Kernel.expects(:open).returns(StringIO.new(cdata_content))
22
+ end
23
+ should "not escape the cdata entries, should leave cdata invisible" do
24
+ #<!--<![CDATA[
25
+ assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
26
+ end
27
+ end
28
+ end
29
+
@@ -0,0 +1,23 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <body>
3
+ <div class="art-full adwords-text">
4
+ <h1>Jud�nyov�: Celebrity sveta m�dy za�ila na vlastnej ko�i</h1>
5
+ <div id="zoom" class="zoom2">
6
+ <div class="art-info">Pravda.sk |
7
+ <span class="time-date">25.&nbsp;febru�ra&nbsp;2010&nbsp;&nbsp;5:51</span>
8
+ </div>
9
+ <div class="perex" id="article-perex">Moder�torka Erika Jud�nyov� sa ned�vno vr�tila z v�letu v New Yorku. Z��astnila sa tam na m�dnom t��dni a s telev�znym �t�bom aj nieko�ko prehliadok. </div>
10
+ <ul>
11
+ <li class="fotogaleria">
12
+ <a href="/foto.asp?r=sk-kkoktail&amp;c=A100224_165338_sk-kkoktail_p20">
13
+ Gal�ria: Erika Jud�nyov�</a>
14
+ <script type="text/javascript"><!--<![CDATA[
15
+ /* SLAVE: perex_sk.perex_sk.perex.koktail.center */
16
+ ado.slave('adoceanskqdisnunpvu', {myMaster: 'uikHnAPTNwh_AVZX4uAdPP6xUQPhUSb01rCKlMcgapn.97' });
17
+ //]]>--></script>
18
+ <!-- [/Koktail/CENTER] -->
19
+ </div><p>&quot;Videli sme napr�klad �ou Custo Barcelona alebo prehliadku Very Wangovej,&quot; prezradila pre Pravda.sk moder�torka Smot�nky. Spolu so �t�bom zaznamen�vala dianie. Jeden z najzauj�mavej��ch �lovkov pre kameru mark�zackej rel�cie bola ��fredaktorka magaz�nu Vogue Anna Wintour. &quot;Pri�la na prehliadku, nasadila si tmav� okuliare a potichu z�vala. �o ma v�ak najviac prekvapilo, nezostala do konca prehliadky, asi tri min�ty pred koncom sa pov��enecky zdvihla a odi�la,&quot; zaspom�nala si Jud�nyov�.</p>
20
+ </li>
21
+ </ul>
22
+ </body>
23
+ </html>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt
@@ -32,16 +32,6 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: 0.9.8
34
34
  version:
35
- - !ruby/object:Gem::Dependency
36
- name: sanitize
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.2.0
44
- version:
45
35
  - !ruby/object:Gem::Dependency
46
36
  name: nokogiri
47
37
  type: :runtime
@@ -64,7 +54,6 @@ extra_rdoc_files:
64
54
  files:
65
55
  - .document
66
56
  - .gitignore
67
- - .specification
68
57
  - LICENSE
69
58
  - README.rdoc
70
59
  - Rakefile
@@ -78,6 +67,8 @@ files:
78
67
  - test/helper.rb
79
68
  - test/test_content_mapping.rb
80
69
  - test/test_content_scrapper.rb
70
+ - test/test_pages.rb
71
+ - test/test_pages/cdata.html
81
72
  - test/test_pages/encoding.html
82
73
  - test/test_pages/pretty.html
83
74
  - test/test_pages/twocontent.html
@@ -113,4 +104,5 @@ summary: Gem for those who want to screen scrap only the content part of web pag
113
104
  test_files:
114
105
  - test/test_content_mapping.rb
115
106
  - test/test_content_scrapper.rb
107
+ - test/test_pages.rb
116
108
  - test/helper.rb
data/.specification DELETED
@@ -1,115 +0,0 @@
1
- --- !ruby/object:Gem::Specification
2
- name: content_scrapper
3
- version: !ruby/object:Gem::Version
4
- version: 99.99.99
5
- platform: ruby
6
- authors:
7
- - Gyorgy Frivolt
8
- autorequire:
9
- bindir: bin
10
- cert_chain: []
11
-
12
- date: 2010-02-13 00:00:00 +01:00
13
- default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: thoughtbot-shoulda
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 2.10.2
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: mocha
27
- type: :development
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.9.8
34
- version:
35
- - !ruby/object:Gem::Dependency
36
- name: sanitize
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.2.0
44
- version:
45
- - !ruby/object:Gem::Dependency
46
- name: nokogiri
47
- type: :runtime
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
50
- requirements:
51
- - - ">="
52
- - !ruby/object:Gem::Version
53
- version: 1.4.1
54
- version:
55
- description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
56
- email: gyorgy.frivolt@gmail.com
57
- executables: []
58
-
59
- extensions: []
60
-
61
- extra_rdoc_files:
62
- - LICENSE
63
- - README.rdoc
64
- files:
65
- - .document
66
- - .gitignore
67
- - LICENSE
68
- - README.rdoc
69
- - Rakefile
70
- - VERSION
71
- - config/content_scrapper.rb
72
- - content_scrapper.gemspec
73
- - lib/content_scrapper.rb
74
- - lib/content_scrapper/content_mapping.rb
75
- - lib/content_scrapper/feedzirra.rb
76
- - rails/init.rb
77
- - test/helper.rb
78
- - test/test_content_mapping.rb
79
- - test/test_content_scrapper.rb
80
- - test/test_pages/pretty.html
81
- - test/test_pages/twocontent.html
82
- - test/test_pages/ugly.html
83
- has_rdoc: true
84
- homepage: http://github.com/fifigyuri/content_scrapper
85
- licenses: []
86
-
87
- post_install_message:
88
- rdoc_options:
89
- - --charset=UTF-8
90
- require_paths:
91
- - bin
92
- - lib
93
- required_ruby_version: !ruby/object:Gem::Requirement
94
- requirements:
95
- - - ">="
96
- - !ruby/object:Gem::Version
97
- version: "0"
98
- version:
99
- required_rubygems_version: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: "0"
104
- version:
105
- requirements: []
106
-
107
- rubyforge_project:
108
- rubygems_version: 1.3.5
109
- signing_key:
110
- specification_version: 3
111
- summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
112
- test_files:
113
- - test/test_content_mapping.rb
114
- - test/test_content_scrapper.rb
115
- - test/helper.rb