content_scrapper 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +0 -1
- data/VERSION +1 -1
- data/content_scrapper.gemspec +4 -5
- data/lib/content_scrapper.rb +17 -4
- data/lib/content_scrapper/content_mapping.rb +2 -1
- data/test/test_content_mapping.rb +2 -2
- data/test/test_content_scrapper.rb +18 -0
- data/test/test_pages.rb +29 -0
- data/test/test_pages/cdata.html +23 -0
- metadata +4 -12
- data/.specification +0 -115
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.6
|
1
|
+
0.0.7
|
data/content_scrapper.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.6"
|
8
|
+
s.version = "0.0.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
@@ -19,7 +19,6 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
".gitignore",
|
22
|
-
".specification",
|
23
22
|
"LICENSE",
|
24
23
|
"README.rdoc",
|
25
24
|
"Rakefile",
|
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
|
|
33
32
|
"test/helper.rb",
|
34
33
|
"test/test_content_mapping.rb",
|
35
34
|
"test/test_content_scrapper.rb",
|
35
|
+
"test/test_pages.rb",
|
36
|
+
"test/test_pages/cdata.html",
|
36
37
|
"test/test_pages/encoding.html",
|
37
38
|
"test/test_pages/pretty.html",
|
38
39
|
"test/test_pages/twocontent.html",
|
@@ -46,6 +47,7 @@ Gem::Specification.new do |s|
|
|
46
47
|
s.test_files = [
|
47
48
|
"test/test_content_mapping.rb",
|
48
49
|
"test/test_content_scrapper.rb",
|
50
|
+
"test/test_pages.rb",
|
49
51
|
"test/helper.rb"
|
50
52
|
]
|
51
53
|
|
@@ -56,18 +58,15 @@ Gem::Specification.new do |s|
|
|
56
58
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
57
59
|
s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
|
58
60
|
s.add_development_dependency(%q<mocha>, [">= 0.9.8"])
|
59
|
-
s.add_runtime_dependency(%q<sanitize>, [">= 1.2.0"])
|
60
61
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
61
62
|
else
|
62
63
|
s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
|
63
64
|
s.add_dependency(%q<mocha>, [">= 0.9.8"])
|
64
|
-
s.add_dependency(%q<sanitize>, [">= 1.2.0"])
|
65
65
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
66
66
|
end
|
67
67
|
else
|
68
68
|
s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
|
69
69
|
s.add_dependency(%q<mocha>, [">= 0.9.8"])
|
70
|
-
s.add_dependency(%q<sanitize>, [">= 1.2.0"])
|
71
70
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
72
71
|
end
|
73
72
|
end
|
data/lib/content_scrapper.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'nokogiri'
|
4
|
-
require 'sanitize'
|
5
4
|
|
6
5
|
require 'content_scrapper/content_mapping'
|
7
6
|
|
@@ -20,7 +19,7 @@ class ContentScrapper
|
|
20
19
|
ContentScrapper.default = self
|
21
20
|
end
|
22
21
|
|
23
|
-
|
22
|
+
attr_reader :content_mappings
|
24
23
|
|
25
24
|
def initialize(scrapper_config_file = nil)
|
26
25
|
@content_mappings = []
|
@@ -34,8 +33,22 @@ class ContentScrapper
|
|
34
33
|
@content_mappings << new_mapping
|
35
34
|
end
|
36
35
|
|
37
|
-
def
|
38
|
-
@
|
36
|
+
def clean_content(content)
|
37
|
+
@content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
|
38
|
+
end
|
39
|
+
|
40
|
+
def sanitize_tags(&sanitize_settings)
|
41
|
+
@content_cleaner_block = lambda do |content|
|
42
|
+
require 'sanitize'
|
43
|
+
Sanitize.clean(content, sanitize_settings.call())
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def loofah_tags(scrap_type)
|
48
|
+
@content_scrapper_block = lambda do |content|
|
49
|
+
require 'loofah'
|
50
|
+
Loofah.document(content).scrub!(scrap_type).to_s
|
51
|
+
end
|
39
52
|
end
|
40
53
|
|
41
54
|
def scrap_content(url)
|
@@ -29,10 +29,11 @@ class ContentMapping
|
|
29
29
|
@content_xpaths_list.each do |content_xpath|
|
30
30
|
content_section = doc.xpath(content_xpath)
|
31
31
|
content = content_section.to_a.join("\n")
|
32
|
-
content =
|
32
|
+
content = content_scrapper.clean_content(content) unless content_scrapper.nil?
|
33
33
|
content = Iconv.conv(to=iconv_to, from=iconv_from, content) unless iconv_to.nil?
|
34
34
|
return content if content_section.count > 0
|
35
35
|
end
|
36
36
|
nil
|
37
37
|
end
|
38
38
|
end
|
39
|
+
|
@@ -9,7 +9,7 @@ class TestContentMapping < Test::Unit::TestCase
|
|
9
9
|
@mapping = ContentMapping.new
|
10
10
|
@mapping.instance_eval do
|
11
11
|
url_pattern /^http:\/\/www\.matchme\.com\//
|
12
|
-
|
12
|
+
content_at '//div[@id="failing_content"]'
|
13
13
|
content_at '//div[@id="itext_content"]'
|
14
14
|
content_at '//div[@id="itext_second_content"]'
|
15
15
|
end
|
@@ -46,7 +46,7 @@ class TestContentMapping < Test::Unit::TestCase
|
|
46
46
|
@mapping = ContentMapping.new
|
47
47
|
@mapping.instance_eval do
|
48
48
|
url_pattern /^http:\/\/www\.matchme\.com\//
|
49
|
-
|
49
|
+
content_at '//div[@class="node node-story"]/div[@class="content"]/p'
|
50
50
|
iconv :to => 'utf8', :from => 'latin1'
|
51
51
|
end
|
52
52
|
page = File.open("#{File.dirname(__FILE__)}/test_pages/encoding.html").read
|
@@ -5,6 +5,24 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
5
5
|
|
6
6
|
ContentScrapper.default_config_file = nil
|
7
7
|
|
8
|
+
context "on settings without sanitization tags" do
|
9
|
+
setup do
|
10
|
+
@scrapper = ContentScrapper.new
|
11
|
+
@scrapper.instance_eval do
|
12
|
+
content_mapping do
|
13
|
+
url_pattern /.*/
|
14
|
+
content_at '//div[@id="itext_content"]'
|
15
|
+
end
|
16
|
+
end
|
17
|
+
content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
|
18
|
+
stringio = StringIO.new(content)
|
19
|
+
Kernel.expects(:open).returns(stringio)
|
20
|
+
end
|
21
|
+
should 'not sanitize' do
|
22
|
+
assert !@scrapper.scrap_content('http://www.pretty.url/fsdsd').nil?
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
8
26
|
context "on common settings" do
|
9
27
|
setup do
|
10
28
|
@scrapper = ContentScrapper.new
|
data/test/test_pages.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
|
2
|
+
require 'helper'
|
3
|
+
require 'mocha'
|
4
|
+
|
5
|
+
class TestContentScrapper < Test::Unit::TestCase
|
6
|
+
|
7
|
+
context "on page containing CDATA" do
|
8
|
+
setup do
|
9
|
+
@scrapper = ContentScrapper.new
|
10
|
+
@scrapper.instance_eval do
|
11
|
+
content_mapping do
|
12
|
+
url_pattern /.*/
|
13
|
+
content_at '//div[@class="art-full adwords-text"]'
|
14
|
+
end
|
15
|
+
loofah_tags(:strip)
|
16
|
+
end
|
17
|
+
@scrapper.rescue_scrapping do |exception|
|
18
|
+
puts exception
|
19
|
+
end
|
20
|
+
cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
|
21
|
+
Kernel.expects(:open).returns(StringIO.new(cdata_content))
|
22
|
+
end
|
23
|
+
should "not escape the cdata entries, should leave cdata unvisible" do
|
24
|
+
#<!--<![CDATA[
|
25
|
+
assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<body>
|
3
|
+
<div class="art-full adwords-text">
|
4
|
+
<h1>Judínyová: Celebrity sveta módy zažila na vlastnej koži</h1>
|
5
|
+
<div id="zoom" class="zoom2">
|
6
|
+
<div class="art-info">Pravda.sk |
|
7
|
+
<span class="time-date">25. februára 2010 5:51</span>
|
8
|
+
</div>
|
9
|
+
<div class="perex" id="article-perex">Moderátorka Erika Judínyová sa nedávno vrátila z výletu v New Yorku. Zúčastnila sa tam na módnom týždni a s televíznym štábom aj niekoľko prehliadok. </div>
|
10
|
+
<ul>
|
11
|
+
<li class="fotogaleria">
|
12
|
+
<a href="/foto.asp?r=sk-kkoktail&c=A100224_165338_sk-kkoktail_p20">
|
13
|
+
Galéria: Erika Judínyová</a>
|
14
|
+
<script type="text/javascript"><!--<![CDATA[
|
15
|
+
/* SLAVE: perex_sk.perex_sk.perex.koktail.center */
|
16
|
+
ado.slave('adoceanskqdisnunpvu', {myMaster: 'uikHnAPTNwh_AVZX4uAdPP6xUQPhUSb01rCKlMcgapn.97' });
|
17
|
+
//]]>--></script>
|
18
|
+
<!-- [/Koktail/CENTER] -->
|
19
|
+
</div><p>"Videli sme napríklad šou Custo Barcelona alebo prehliadku Very Wangovej," prezradila pre Pravda.sk moderátorka Smotánky. Spolu so štábom zaznamenávala dianie. Jeden z najzaujímavejších úlovkov pre kameru markízackej relácie bola šéfredaktorka magazínu Vogue Anna Wintour. "Prišla na prehliadku, nasadila si tmavé okuliare a potichu zívala. Čo ma však najviac prekvapilo, nezostala do konca prehliadky, asi tri minúty pred koncom sa povýšenecky zdvihla a odišla," zaspomínala si Judínyová.</p>
|
20
|
+
</li>
|
21
|
+
</ul>
|
22
|
+
</body>
|
23
|
+
</html>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -32,16 +32,6 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.9.8
|
34
34
|
version:
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: sanitize
|
37
|
-
type: :runtime
|
38
|
-
version_requirement:
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
40
|
-
requirements:
|
41
|
-
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: 1.2.0
|
44
|
-
version:
|
45
35
|
- !ruby/object:Gem::Dependency
|
46
36
|
name: nokogiri
|
47
37
|
type: :runtime
|
@@ -64,7 +54,6 @@ extra_rdoc_files:
|
|
64
54
|
files:
|
65
55
|
- .document
|
66
56
|
- .gitignore
|
67
|
-
- .specification
|
68
57
|
- LICENSE
|
69
58
|
- README.rdoc
|
70
59
|
- Rakefile
|
@@ -78,6 +67,8 @@ files:
|
|
78
67
|
- test/helper.rb
|
79
68
|
- test/test_content_mapping.rb
|
80
69
|
- test/test_content_scrapper.rb
|
70
|
+
- test/test_pages.rb
|
71
|
+
- test/test_pages/cdata.html
|
81
72
|
- test/test_pages/encoding.html
|
82
73
|
- test/test_pages/pretty.html
|
83
74
|
- test/test_pages/twocontent.html
|
@@ -113,4 +104,5 @@ summary: Gem for those who want to screen scrap only the content part of web pag
|
|
113
104
|
test_files:
|
114
105
|
- test/test_content_mapping.rb
|
115
106
|
- test/test_content_scrapper.rb
|
107
|
+
- test/test_pages.rb
|
116
108
|
- test/helper.rb
|
data/.specification
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
2
|
-
name: content_scrapper
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 99.99.99
|
5
|
-
platform: ruby
|
6
|
-
authors:
|
7
|
-
- Gyorgy Frivolt
|
8
|
-
autorequire:
|
9
|
-
bindir: bin
|
10
|
-
cert_chain: []
|
11
|
-
|
12
|
-
date: 2010-02-13 00:00:00 +01:00
|
13
|
-
default_executable:
|
14
|
-
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
16
|
-
name: thoughtbot-shoulda
|
17
|
-
type: :development
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 2.10.2
|
24
|
-
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: mocha
|
27
|
-
type: :development
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 0.9.8
|
34
|
-
version:
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: sanitize
|
37
|
-
type: :runtime
|
38
|
-
version_requirement:
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
40
|
-
requirements:
|
41
|
-
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: 1.2.0
|
44
|
-
version:
|
45
|
-
- !ruby/object:Gem::Dependency
|
46
|
-
name: nokogiri
|
47
|
-
type: :runtime
|
48
|
-
version_requirement:
|
49
|
-
version_requirements: !ruby/object:Gem::Requirement
|
50
|
-
requirements:
|
51
|
-
- - ">="
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: 1.4.1
|
54
|
-
version:
|
55
|
-
description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
|
56
|
-
email: gyorgy.frivolt@gmail.com
|
57
|
-
executables: []
|
58
|
-
|
59
|
-
extensions: []
|
60
|
-
|
61
|
-
extra_rdoc_files:
|
62
|
-
- LICENSE
|
63
|
-
- README.rdoc
|
64
|
-
files:
|
65
|
-
- .document
|
66
|
-
- .gitignore
|
67
|
-
- LICENSE
|
68
|
-
- README.rdoc
|
69
|
-
- Rakefile
|
70
|
-
- VERSION
|
71
|
-
- config/content_scrapper.rb
|
72
|
-
- content_scrapper.gemspec
|
73
|
-
- lib/content_scrapper.rb
|
74
|
-
- lib/content_scrapper/content_mapping.rb
|
75
|
-
- lib/content_scrapper/feedzirra.rb
|
76
|
-
- rails/init.rb
|
77
|
-
- test/helper.rb
|
78
|
-
- test/test_content_mapping.rb
|
79
|
-
- test/test_content_scrapper.rb
|
80
|
-
- test/test_pages/pretty.html
|
81
|
-
- test/test_pages/twocontent.html
|
82
|
-
- test/test_pages/ugly.html
|
83
|
-
has_rdoc: true
|
84
|
-
homepage: http://github.com/fifigyuri/content_scrapper
|
85
|
-
licenses: []
|
86
|
-
|
87
|
-
post_install_message:
|
88
|
-
rdoc_options:
|
89
|
-
- --charset=UTF-8
|
90
|
-
require_paths:
|
91
|
-
- bin
|
92
|
-
- lib
|
93
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
-
requirements:
|
95
|
-
- - ">="
|
96
|
-
- !ruby/object:Gem::Version
|
97
|
-
version: "0"
|
98
|
-
version:
|
99
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: "0"
|
104
|
-
version:
|
105
|
-
requirements: []
|
106
|
-
|
107
|
-
rubyforge_project:
|
108
|
-
rubygems_version: 1.3.5
|
109
|
-
signing_key:
|
110
|
-
specification_version: 3
|
111
|
-
summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
|
112
|
-
test_files:
|
113
|
-
- test/test_content_mapping.rb
|
114
|
-
- test/test_content_scrapper.rb
|
115
|
-
- test/helper.rb
|