mechanize-content 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/mechanize-content.rb +19 -38
- data/lib/mechanize-content/util.rb +37 -0
- data/mechanize-content.gemspec +3 -2
- data/spec/mechanize-content_spec.rb +30 -6
- metadata +4 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.1.2
|
data/lib/mechanize-content.rb
CHANGED
@@ -2,14 +2,12 @@ require 'rubygems'
|
|
2
2
|
require 'mechanize'
|
3
3
|
require 'image_size'
|
4
4
|
require 'open-uri'
|
5
|
+
require 'mechanize-content/util'
|
5
6
|
|
6
7
|
class MechanizeContent
|
7
8
|
|
8
9
|
attr_accessor :urls
|
9
|
-
|
10
|
-
MIN_WIDTH = 64
|
11
|
-
MIN_HEIGHT = 64
|
12
|
-
|
10
|
+
|
13
11
|
def initialize(*args)
|
14
12
|
@urls = *args
|
15
13
|
end
|
@@ -37,7 +35,7 @@ class MechanizeContent
|
|
37
35
|
def fetch_texts
|
38
36
|
(@pages || fetch_pages).each do |page|
|
39
37
|
text = fetch_text(page)
|
40
|
-
return @best_text = text unless text.nil?
|
38
|
+
return @best_text = text unless text.nil? || text.empty?
|
41
39
|
end
|
42
40
|
return nil
|
43
41
|
end
|
@@ -126,11 +124,20 @@ class MechanizeContent
|
|
126
124
|
if paragraph.inner_text().length > 10
|
127
125
|
readability[paragraph.parent] += 1
|
128
126
|
end
|
127
|
+
if !paragraph.parent.attributes.values.nil?
|
128
|
+
if !paragraph.parent.attributes.values.first.nil?
|
129
|
+
if paragraph.parent.attributes.values.first.value.include? "comment"
|
130
|
+
break
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
129
134
|
readability[paragraph.parent] += paragraph.inner_text().count(',')
|
130
135
|
end
|
131
136
|
sorted_results = readability.sort_by { |parent,score| -score }
|
132
137
|
if sorted_results.nil? || sorted_results.first.nil?
|
133
138
|
return nil
|
139
|
+
elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
|
140
|
+
return nil
|
134
141
|
else
|
135
142
|
top_result = sorted_results.first.first
|
136
143
|
top_result.css('script').unlink
|
@@ -140,57 +147,31 @@ class MechanizeContent
|
|
140
147
|
return top_result
|
141
148
|
end
|
142
149
|
end
|
143
|
-
|
144
|
-
def get_base_url(doc, url)
|
145
|
-
base_url = doc.xpath("//base/@href").first
|
146
|
-
if base_url.nil?
|
147
|
-
return url
|
148
|
-
else
|
149
|
-
return base_url.value
|
150
|
-
end
|
151
|
-
end
|
152
|
-
|
150
|
+
|
153
151
|
def fetch_image(page)
|
154
152
|
top_content = fetch_content(page)
|
155
153
|
if top_content
|
156
|
-
return find_best_image(top_content.css('img'), get_base_url(page.parser, page.uri))
|
154
|
+
return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
|
157
155
|
else
|
158
156
|
return nil
|
159
157
|
end
|
160
158
|
end
|
161
|
-
|
162
|
-
def valid_image?(width, height, src)
|
163
|
-
if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
|
164
|
-
if (!(width == 728) && !(height == 90))
|
165
|
-
return true
|
166
|
-
end
|
167
|
-
end
|
168
|
-
return false
|
169
|
-
end
|
170
|
-
|
171
|
-
def build_absolute_url(current_src, url)
|
172
|
-
uri = URI.parse(current_src)
|
173
|
-
if uri.relative?
|
174
|
-
current_src = (URI.parse(url.to_s)+current_src).to_s
|
175
|
-
end
|
176
|
-
current_src
|
177
|
-
end
|
178
|
-
|
159
|
+
|
179
160
|
def find_best_image(all_images, url)
|
180
161
|
begin
|
181
162
|
current_src = nil
|
182
163
|
all_images.each do |img|
|
183
164
|
current_src = img["src"]
|
184
|
-
if valid_image?(img['width'].to_i, img['height'].to_i, current_src)
|
185
|
-
return build_absolute_url(current_src, url)
|
165
|
+
if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
|
166
|
+
return Util.build_absolute_url(current_src, url)
|
186
167
|
end
|
187
168
|
end
|
188
169
|
all_images.each do |img|
|
189
170
|
current_src = img["src"]
|
190
|
-
current_src = build_absolute_url(current_src, url)
|
171
|
+
current_src = Util.build_absolute_url(current_src, url)
|
191
172
|
open(current_src, "rb") do |fh|
|
192
173
|
is = ImageSize.new(fh.read)
|
193
|
-
if valid_image?(is.width, is.height, current_src)
|
174
|
+
if Util.valid_image?(is.width, is.height, current_src)
|
194
175
|
return current_src
|
195
176
|
end
|
196
177
|
end
|
data/lib/mechanize-content/util.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
class MechanizeContent
|
2
|
+
|
3
|
+
class Util
|
4
|
+
|
5
|
+
MIN_WIDTH = 64
|
6
|
+
MIN_HEIGHT = 64
|
7
|
+
AD_WIDTH = 728
|
8
|
+
AD_HEIGHT = 90
|
9
|
+
|
10
|
+
def self.get_base_url(doc, url)
|
11
|
+
base_url = doc.xpath("//base/@href").first
|
12
|
+
if base_url.nil?
|
13
|
+
return url
|
14
|
+
else
|
15
|
+
return base_url.value
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.build_absolute_url(current_src, url)
|
20
|
+
if URI.parse(current_src).relative?
|
21
|
+
current_src = (URI.parse(url.to_s)+current_src).to_s
|
22
|
+
end
|
23
|
+
current_src
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.valid_image?(width, height, src)
|
27
|
+
if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
|
28
|
+
if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
|
29
|
+
return true
|
30
|
+
end
|
31
|
+
end
|
32
|
+
return false
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
data/mechanize-content.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{mechanize-content}
|
8
|
-
s.version = "0.1.1"
|
8
|
+
s.version = "0.1.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John Griffin"]
|
12
|
-
s.date = %q{2010-03-
|
12
|
+
s.date = %q{2010-03-23}
|
13
13
|
s.description = %q{pass in a url or urls and mechanize-content will select the best block of text, image and title by analysing the page content}
|
14
14
|
s.email = %q{johnog@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"lib/mechanize-content.rb",
|
27
|
+
"lib/mechanize-content/util.rb",
|
27
28
|
"mechanize-content.gemspec",
|
28
29
|
"spec/mechanize-content_spec.rb",
|
29
30
|
"spec/spec.opts",
|
data/spec/mechanize-content_spec.rb
CHANGED
@@ -75,32 +75,32 @@ describe "MechanizeContent" do
|
|
75
75
|
|
76
76
|
it "reject all gifs" do
|
77
77
|
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
78
|
-
|
78
|
+
MechanizeContent::Util.valid_image?(500, 500, "http://www.cmpevents.com/GD10/ablank.gif2").should eql(false)
|
79
79
|
end
|
80
80
|
|
81
81
|
it "reject image with banner in the name" do
|
82
82
|
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
83
|
-
|
83
|
+
MechanizeContent::Util.valid_image?(500, 500, "http://www.cmpevents.com/GD10/banner.png").should eql(false)
|
84
84
|
end
|
85
85
|
|
86
86
|
it "reject image that is too small" do
|
87
87
|
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
88
|
-
|
88
|
+
MechanizeContent::Util.valid_image?(64, 500, "http://www.cmpevents.com/GD10/toosmall.png").should eql(false)
|
89
89
|
end
|
90
90
|
|
91
91
|
it "allow good images" do
|
92
92
|
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
93
|
-
|
93
|
+
MechanizeContent::Util.valid_image?(500, 500, "http://www.cmpevents.com/GD10/perfecto.png").should eql(true)
|
94
94
|
end
|
95
95
|
|
96
96
|
it "build a base url for images" do
|
97
97
|
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
98
98
|
page = mc.fetch_page("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
99
|
-
|
99
|
+
MechanizeContent::Util.get_base_url(page.parser, page.uri).to_s.should eql("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
100
100
|
|
101
101
|
mc = MechanizeContent.new("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
|
102
102
|
page = mc.fetch_page("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
|
103
|
-
|
103
|
+
MechanizeContent::Util.get_base_url(page.parser, page.uri).to_s.should eql("http://www.mutinydesign.co.uk/")
|
104
104
|
end
|
105
105
|
|
106
106
|
it "find image" do
|
@@ -146,4 +146,28 @@ describe "MechanizeContent" do
|
|
146
146
|
mc.best_image.should eql("http://assets.vg247.com/current//2010/03/rockbandlogo.jpg")
|
147
147
|
end
|
148
148
|
|
149
|
+
it "gog link no text" do
|
150
|
+
mc = MechanizeContent.new("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition", "http://www.destructoid.com/-nuff-said-good-old-games-gets-another-world-168150.phtml", "http://www.joystiq.com/2010/03/18/another-world-15th-anniversary-edition-now-on-gog-com/")
|
151
|
+
mc.best_title.should eql("Another World: 15th Anniversary Edition - GOG.com")
|
152
|
+
mc.best_text.should eql("Another World -- or Out of this World, as many of you will know it by -- is now on DRM-free digital distribution service Good Old Games. It can be had for $9.99. Need I say more?\rI love the game, even though I have never made it more than oh, five minutes in. It's more or less universally loved by the Destructoid staff. Not long after we got an email detailing the good news, the thread soon reached fifteen or so replies full of praise for the game.\rOther, less exciting recent releases include: Empire Earth II Gold, Gabriel Knight 3, and Aquanox. Not to completely s**t on these games, but this is Another World we're talking about here.")
|
153
|
+
mc.best_image.should eql("http://www.blogcdn.com/www.joystiq.com/media/2010/03/anotherworldheaderimg580px223.jpg")
|
154
|
+
end
|
155
|
+
|
156
|
+
it "getting wrong blurb from detructoid" do
|
157
|
+
mc = MechanizeContent.new("http://www.destructoid.com/-nuff-said-good-old-games-gets-another-world-168150.phtml")
|
158
|
+
mc.best_title.should eql("Destructoid - 'Nuff said: Good Old Games gets Another World")
|
159
|
+
mc.best_text.should eql("Another World -- or Out of this World, as many of you will know it by -- is now on DRM-free digital distribution service Good Old Games. It can be had for $9.99. Need I say more?\rI love the game, even though I have never made it more than oh, five minutes in. It's more or less universally loved by the Destructoid staff. Not long after we got an email detailing the good news, the thread soon reached fifteen or so replies full of praise for the game.\rOther, less exciting recent releases include: Empire Earth II Gold, Gabriel Knight 3, and Aquanox. Not to completely s**t on these games, but this is Another World we're talking about here.")
|
160
|
+
mc.best_image.should eql(nil)
|
161
|
+
end
|
162
|
+
|
163
|
+
it "avoid using copy from flash sites" do
|
164
|
+
mc = MechanizeContent.new("http://www.godofwar.com/spartansstandtall/")
|
165
|
+
mc.best_text.should eql(nil)
|
166
|
+
end
|
167
|
+
|
168
|
+
it "get this flash site to return nil for a title" do
|
169
|
+
mc = MechanizeContent.new("http://www.thewitcher.com/")
|
170
|
+
mc.best_text.should eql(nil)
|
171
|
+
end
|
172
|
+
|
149
173
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 1
|
9
|
-
version: 0.1.1
|
8
|
+
- 2
|
9
|
+
version: 0.1.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John Griffin
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-03-
|
17
|
+
date: 2010-03-23 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- Rakefile
|
63
63
|
- VERSION
|
64
64
|
- lib/mechanize-content.rb
|
65
|
+
- lib/mechanize-content/util.rb
|
65
66
|
- mechanize-content.gemspec
|
66
67
|
- spec/mechanize-content_spec.rb
|
67
68
|
- spec/spec.opts
|