content_scrapper 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +0 -1
- data/VERSION +1 -1
- data/content_scrapper.gemspec +4 -5
- data/lib/content_scrapper.rb +17 -4
- data/lib/content_scrapper/content_mapping.rb +2 -1
- data/test/test_content_mapping.rb +2 -2
- data/test/test_content_scrapper.rb +18 -0
- data/test/test_pages.rb +29 -0
- data/test/test_pages/cdata.html +23 -0
- metadata +4 -12
- data/.specification +0 -115
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.6
|
1
|
+
0.0.7
|
data/content_scrapper.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.6"
|
8
|
+
s.version = "0.0.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
@@ -19,7 +19,6 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
".gitignore",
|
22
|
-
".specification",
|
23
22
|
"LICENSE",
|
24
23
|
"README.rdoc",
|
25
24
|
"Rakefile",
|
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
|
|
33
32
|
"test/helper.rb",
|
34
33
|
"test/test_content_mapping.rb",
|
35
34
|
"test/test_content_scrapper.rb",
|
35
|
+
"test/test_pages.rb",
|
36
|
+
"test/test_pages/cdata.html",
|
36
37
|
"test/test_pages/encoding.html",
|
37
38
|
"test/test_pages/pretty.html",
|
38
39
|
"test/test_pages/twocontent.html",
|
@@ -46,6 +47,7 @@ Gem::Specification.new do |s|
|
|
46
47
|
s.test_files = [
|
47
48
|
"test/test_content_mapping.rb",
|
48
49
|
"test/test_content_scrapper.rb",
|
50
|
+
"test/test_pages.rb",
|
49
51
|
"test/helper.rb"
|
50
52
|
]
|
51
53
|
|
@@ -56,18 +58,15 @@ Gem::Specification.new do |s|
|
|
56
58
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
57
59
|
s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
|
58
60
|
s.add_development_dependency(%q<mocha>, [">= 0.9.8"])
|
59
|
-
s.add_runtime_dependency(%q<sanitize>, [">= 1.2.0"])
|
60
61
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
61
62
|
else
|
62
63
|
s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
|
63
64
|
s.add_dependency(%q<mocha>, [">= 0.9.8"])
|
64
|
-
s.add_dependency(%q<sanitize>, [">= 1.2.0"])
|
65
65
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
66
66
|
end
|
67
67
|
else
|
68
68
|
s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
|
69
69
|
s.add_dependency(%q<mocha>, [">= 0.9.8"])
|
70
|
-
s.add_dependency(%q<sanitize>, [">= 1.2.0"])
|
71
70
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
72
71
|
end
|
73
72
|
end
|
data/lib/content_scrapper.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'nokogiri'
|
4
|
-
require 'sanitize'
|
5
4
|
|
6
5
|
require 'content_scrapper/content_mapping'
|
7
6
|
|
@@ -20,7 +19,7 @@ class ContentScrapper
|
|
20
19
|
ContentScrapper.default = self
|
21
20
|
end
|
22
21
|
|
23
|
-
|
22
|
+
attr_reader :content_mappings
|
24
23
|
|
25
24
|
def initialize(scrapper_config_file = nil)
|
26
25
|
@content_mappings = []
|
@@ -34,8 +33,22 @@ class ContentScrapper
|
|
34
33
|
@content_mappings << new_mapping
|
35
34
|
end
|
36
35
|
|
37
|
-
def
|
38
|
-
@
|
36
|
+
def clean_content(content)
|
37
|
+
@content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
|
38
|
+
end
|
39
|
+
|
40
|
+
def sanitize_tags(&sanitize_settings)
|
41
|
+
@content_cleaner_block = lambda do |content|
|
42
|
+
require 'sanitize'
|
43
|
+
Sanitize.clean(content, sanitize_settings.call())
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def loofah_tags(scrap_type)
|
48
|
+
@content_scrapper_block = lambda do |content|
|
49
|
+
require 'loofah'
|
50
|
+
Loofah.document(content).scrub!(scrap_type).to_s
|
51
|
+
end
|
39
52
|
end
|
40
53
|
|
41
54
|
def scrap_content(url)
|
@@ -29,10 +29,11 @@ class ContentMapping
|
|
29
29
|
@content_xpaths_list.each do |content_xpath|
|
30
30
|
content_section = doc.xpath(content_xpath)
|
31
31
|
content = content_section.to_a.join("\n")
|
32
|
-
content =
|
32
|
+
content = content_scrapper.clean_content(content) unless content_scrapper.nil?
|
33
33
|
content = Iconv.conv(to=iconv_to, from=iconv_from, content) unless iconv_to.nil?
|
34
34
|
return content if content_section.count > 0
|
35
35
|
end
|
36
36
|
nil
|
37
37
|
end
|
38
38
|
end
|
39
|
+
|
@@ -9,7 +9,7 @@ class TestContentMapping < Test::Unit::TestCase
|
|
9
9
|
@mapping = ContentMapping.new
|
10
10
|
@mapping.instance_eval do
|
11
11
|
url_pattern /^http:\/\/www\.matchme\.com\//
|
12
|
-
|
12
|
+
content_at '//div[@id="failing_content"]'
|
13
13
|
content_at '//div[@id="itext_content"]'
|
14
14
|
content_at '//div[@id="itext_second_content"]'
|
15
15
|
end
|
@@ -46,7 +46,7 @@ class TestContentMapping < Test::Unit::TestCase
|
|
46
46
|
@mapping = ContentMapping.new
|
47
47
|
@mapping.instance_eval do
|
48
48
|
url_pattern /^http:\/\/www\.matchme\.com\//
|
49
|
-
|
49
|
+
content_at '//div[@class="node node-story"]/div[@class="content"]/p'
|
50
50
|
iconv :to => 'utf8', :from => 'latin1'
|
51
51
|
end
|
52
52
|
page = File.open("#{File.dirname(__FILE__)}/test_pages/encoding.html").read
|
@@ -5,6 +5,24 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
5
5
|
|
6
6
|
ContentScrapper.default_config_file = nil
|
7
7
|
|
8
|
+
context "on settings without sanitization tags" do
|
9
|
+
setup do
|
10
|
+
@scrapper = ContentScrapper.new
|
11
|
+
@scrapper.instance_eval do
|
12
|
+
content_mapping do
|
13
|
+
url_pattern /.*/
|
14
|
+
content_at '//div[@id="itext_content"]'
|
15
|
+
end
|
16
|
+
end
|
17
|
+
content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
|
18
|
+
stringio = StringIO.new(content)
|
19
|
+
Kernel.expects(:open).returns(stringio)
|
20
|
+
end
|
21
|
+
should 'not sanitize' do
|
22
|
+
assert !@scrapper.scrap_content('http://www.pretty.url/fsdsd').nil?
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
8
26
|
context "on common settings" do
|
9
27
|
setup do
|
10
28
|
@scrapper = ContentScrapper.new
|
data/test/test_pages.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
|
2
|
+
require 'helper'
|
3
|
+
require 'mocha'
|
4
|
+
|
5
|
+
class TestContentScrapper < Test::Unit::TestCase
|
6
|
+
|
7
|
+
context "on page containing CDATA" do
|
8
|
+
setup do
|
9
|
+
@scrapper = ContentScrapper.new
|
10
|
+
@scrapper.instance_eval do
|
11
|
+
content_mapping do
|
12
|
+
url_pattern /.*/
|
13
|
+
content_at '//div[@class="art-full adwords-text"]'
|
14
|
+
end
|
15
|
+
loofah_tags(:strip)
|
16
|
+
end
|
17
|
+
@scrapper.rescue_scrapping do |exception|
|
18
|
+
puts exception
|
19
|
+
end
|
20
|
+
cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
|
21
|
+
Kernel.expects(:open).returns(StringIO.new(cdata_content))
|
22
|
+
end
|
23
|
+
should "not escape the cdata entries, should leave cdata unvisible" do
|
24
|
+
#<!--<![CDATA[
|
25
|
+
assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<body>
|
3
|
+
<div class="art-full adwords-text">
|
4
|
+
<h1>Judínyová: Celebrity sveta módy zažila na vlastnej koži</h1>
|
5
|
+
<div id="zoom" class="zoom2">
|
6
|
+
<div class="art-info">Pravda.sk |
|
7
|
+
<span class="time-date">25. februára 2010 5:51</span>
|
8
|
+
</div>
|
9
|
+
<div class="perex" id="article-perex">Moderátorka Erika Judínyová sa nedávno vrátila z výletu v New Yorku. Zúčastnila sa tam na módnom týždni a s televíznym štábom aj niekoľko prehliadok. </div>
|
10
|
+
<ul>
|
11
|
+
<li class="fotogaleria">
|
12
|
+
<a href="/foto.asp?r=sk-kkoktail&c=A100224_165338_sk-kkoktail_p20">
|
13
|
+
Galéria: Erika Judínyová</a>
|
14
|
+
<script type="text/javascript"><!--<![CDATA[
|
15
|
+
/* SLAVE: perex_sk.perex_sk.perex.koktail.center */
|
16
|
+
ado.slave('adoceanskqdisnunpvu', {myMaster: 'uikHnAPTNwh_AVZX4uAdPP6xUQPhUSb01rCKlMcgapn.97' });
|
17
|
+
//]]>--></script>
|
18
|
+
<!-- [/Koktail/CENTER] -->
|
19
|
+
</div><p>"Videli sme napríklad šou Custo Barcelona alebo prehliadku Very Wangovej," prezradila pre Pravda.sk moderátorka Smotánky. Spolu so štábom zaznamenávala dianie. Jeden z najzaujímavejších úlovkov pre kameru markízackej relácie bola šéfredaktorka magazínu Vogue Anna Wintour. "Prišla na prehliadku, nasadila si tmavé okuliare a potichu zívala. Čo ma však najviac prekvapilo, nezostala do konca prehliadky, asi tri minúty pred koncom sa povýšenecky zdvihla a odišla," zaspomínala si Judínyová.</p>
|
20
|
+
</li>
|
21
|
+
</ul>
|
22
|
+
</body>
|
23
|
+
</html>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -32,16 +32,6 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.9.8
|
34
34
|
version:
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: sanitize
|
37
|
-
type: :runtime
|
38
|
-
version_requirement:
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
40
|
-
requirements:
|
41
|
-
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: 1.2.0
|
44
|
-
version:
|
45
35
|
- !ruby/object:Gem::Dependency
|
46
36
|
name: nokogiri
|
47
37
|
type: :runtime
|
@@ -64,7 +54,6 @@ extra_rdoc_files:
|
|
64
54
|
files:
|
65
55
|
- .document
|
66
56
|
- .gitignore
|
67
|
-
- .specification
|
68
57
|
- LICENSE
|
69
58
|
- README.rdoc
|
70
59
|
- Rakefile
|
@@ -78,6 +67,8 @@ files:
|
|
78
67
|
- test/helper.rb
|
79
68
|
- test/test_content_mapping.rb
|
80
69
|
- test/test_content_scrapper.rb
|
70
|
+
- test/test_pages.rb
|
71
|
+
- test/test_pages/cdata.html
|
81
72
|
- test/test_pages/encoding.html
|
82
73
|
- test/test_pages/pretty.html
|
83
74
|
- test/test_pages/twocontent.html
|
@@ -113,4 +104,5 @@ summary: Gem for those who want to screen scrap only the content part of web pag
|
|
113
104
|
test_files:
|
114
105
|
- test/test_content_mapping.rb
|
115
106
|
- test/test_content_scrapper.rb
|
107
|
+
- test/test_pages.rb
|
116
108
|
- test/helper.rb
|
data/.specification
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
2
|
-
name: content_scrapper
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 99.99.99
|
5
|
-
platform: ruby
|
6
|
-
authors:
|
7
|
-
- Gyorgy Frivolt
|
8
|
-
autorequire:
|
9
|
-
bindir: bin
|
10
|
-
cert_chain: []
|
11
|
-
|
12
|
-
date: 2010-02-13 00:00:00 +01:00
|
13
|
-
default_executable:
|
14
|
-
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
16
|
-
name: thoughtbot-shoulda
|
17
|
-
type: :development
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 2.10.2
|
24
|
-
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: mocha
|
27
|
-
type: :development
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 0.9.8
|
34
|
-
version:
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: sanitize
|
37
|
-
type: :runtime
|
38
|
-
version_requirement:
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
40
|
-
requirements:
|
41
|
-
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: 1.2.0
|
44
|
-
version:
|
45
|
-
- !ruby/object:Gem::Dependency
|
46
|
-
name: nokogiri
|
47
|
-
type: :runtime
|
48
|
-
version_requirement:
|
49
|
-
version_requirements: !ruby/object:Gem::Requirement
|
50
|
-
requirements:
|
51
|
-
- - ">="
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: 1.4.1
|
54
|
-
version:
|
55
|
-
description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
|
56
|
-
email: gyorgy.frivolt@gmail.com
|
57
|
-
executables: []
|
58
|
-
|
59
|
-
extensions: []
|
60
|
-
|
61
|
-
extra_rdoc_files:
|
62
|
-
- LICENSE
|
63
|
-
- README.rdoc
|
64
|
-
files:
|
65
|
-
- .document
|
66
|
-
- .gitignore
|
67
|
-
- LICENSE
|
68
|
-
- README.rdoc
|
69
|
-
- Rakefile
|
70
|
-
- VERSION
|
71
|
-
- config/content_scrapper.rb
|
72
|
-
- content_scrapper.gemspec
|
73
|
-
- lib/content_scrapper.rb
|
74
|
-
- lib/content_scrapper/content_mapping.rb
|
75
|
-
- lib/content_scrapper/feedzirra.rb
|
76
|
-
- rails/init.rb
|
77
|
-
- test/helper.rb
|
78
|
-
- test/test_content_mapping.rb
|
79
|
-
- test/test_content_scrapper.rb
|
80
|
-
- test/test_pages/pretty.html
|
81
|
-
- test/test_pages/twocontent.html
|
82
|
-
- test/test_pages/ugly.html
|
83
|
-
has_rdoc: true
|
84
|
-
homepage: http://github.com/fifigyuri/content_scrapper
|
85
|
-
licenses: []
|
86
|
-
|
87
|
-
post_install_message:
|
88
|
-
rdoc_options:
|
89
|
-
- --charset=UTF-8
|
90
|
-
require_paths:
|
91
|
-
- bin
|
92
|
-
- lib
|
93
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
-
requirements:
|
95
|
-
- - ">="
|
96
|
-
- !ruby/object:Gem::Version
|
97
|
-
version: "0"
|
98
|
-
version:
|
99
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: "0"
|
104
|
-
version:
|
105
|
-
requirements: []
|
106
|
-
|
107
|
-
rubyforge_project:
|
108
|
-
rubygems_version: 1.3.5
|
109
|
-
signing_key:
|
110
|
-
specification_version: 3
|
111
|
-
summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
|
112
|
-
test_files:
|
113
|
-
- test/test_content_mapping.rb
|
114
|
-
- test/test_content_scrapper.rb
|
115
|
-
- test/helper.rb
|