mechanize-content 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.2
@@ -2,14 +2,12 @@ require 'rubygems'
2
2
  require 'mechanize'
3
3
  require 'image_size'
4
4
  require 'open-uri'
5
+ require 'mechanize-content/util'
5
6
 
6
7
  class MechanizeContent
7
8
 
8
9
  attr_accessor :urls
9
-
10
- MIN_WIDTH = 64
11
- MIN_HEIGHT = 64
12
-
10
+
13
11
  def initialize(*args)
14
12
  @urls = *args
15
13
  end
@@ -37,7 +35,7 @@ class MechanizeContent
37
35
  def fetch_texts
38
36
  (@pages || fetch_pages).each do |page|
39
37
  text = fetch_text(page)
40
- return @best_text = text unless text.nil?
38
+ return @best_text = text unless text.nil? || text.empty?
41
39
  end
42
40
  return nil
43
41
  end
@@ -126,11 +124,20 @@ class MechanizeContent
126
124
  if paragraph.inner_text().length > 10
127
125
  readability[paragraph.parent] += 1
128
126
  end
127
+ if !paragraph.parent.attributes.values.nil?
128
+ if !paragraph.parent.attributes.values.first.nil?
129
+ if paragraph.parent.attributes.values.first.value.include? "comment"
130
+ break
131
+ end
132
+ end
133
+ end
129
134
  readability[paragraph.parent] += paragraph.inner_text().count(',')
130
135
  end
131
136
  sorted_results = readability.sort_by { |parent,score| -score }
132
137
  if sorted_results.nil? || sorted_results.first.nil?
133
138
  return nil
139
+ elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
140
+ return nil
134
141
  else
135
142
  top_result = sorted_results.first.first
136
143
  top_result.css('script').unlink
@@ -140,57 +147,31 @@ class MechanizeContent
140
147
  return top_result
141
148
  end
142
149
  end
143
-
144
- def get_base_url(doc, url)
145
- base_url = doc.xpath("//base/@href").first
146
- if base_url.nil?
147
- return url
148
- else
149
- return base_url.value
150
- end
151
- end
152
-
150
+
153
151
  def fetch_image(page)
154
152
  top_content = fetch_content(page)
155
153
  if top_content
156
- return find_best_image(top_content.css('img'), get_base_url(page.parser, page.uri))
154
+ return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
157
155
  else
158
156
  return nil
159
157
  end
160
158
  end
161
-
162
- def valid_image?(width, height, src)
163
- if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
164
- if (!(width == 728) && !(height == 90))
165
- return true
166
- end
167
- end
168
- return false
169
- end
170
-
171
- def build_absolute_url(current_src, url)
172
- uri = URI.parse(current_src)
173
- if uri.relative?
174
- current_src = (URI.parse(url.to_s)+current_src).to_s
175
- end
176
- current_src
177
- end
178
-
159
+
179
160
  def find_best_image(all_images, url)
180
161
  begin
181
162
  current_src = nil
182
163
  all_images.each do |img|
183
164
  current_src = img["src"]
184
- if valid_image?(img['width'].to_i, img['height'].to_i, current_src)
185
- return build_absolute_url(current_src, url)
165
+ if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
166
+ return Util.build_absolute_url(current_src, url)
186
167
  end
187
168
  end
188
169
  all_images.each do |img|
189
170
  current_src = img["src"]
190
- current_src = build_absolute_url(current_src, url)
171
+ current_src = Util.build_absolute_url(current_src, url)
191
172
  open(current_src, "rb") do |fh|
192
173
  is = ImageSize.new(fh.read)
193
- if valid_image?(is.width, is.height, current_src)
174
+ if Util.valid_image?(is.width, is.height, current_src)
194
175
  return current_src
195
176
  end
196
177
  end
@@ -0,0 +1,37 @@
1
+ class MechanizeContent
2
+
3
+ class Util
4
+
5
+ MIN_WIDTH = 64
6
+ MIN_HEIGHT = 64
7
+ AD_WIDTH = 728
8
+ AD_HEIGHT = 90
9
+
10
+ def self.get_base_url(doc, url)
11
+ base_url = doc.xpath("//base/@href").first
12
+ if base_url.nil?
13
+ return url
14
+ else
15
+ return base_url.value
16
+ end
17
+ end
18
+
19
+ def self.build_absolute_url(current_src, url)
20
+ if URI.parse(current_src).relative?
21
+ current_src = (URI.parse(url.to_s)+current_src).to_s
22
+ end
23
+ current_src
24
+ end
25
+
26
+ def self.valid_image?(width, height, src)
27
+ if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
28
+ if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
29
+ return true
30
+ end
31
+ end
32
+ return false
33
+ end
34
+
35
+ end
36
+
37
+ end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{mechanize-content}
8
- s.version = "0.1.1"
8
+ s.version = "0.1.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John Griffin"]
12
- s.date = %q{2010-03-19}
12
+ s.date = %q{2010-03-23}
13
13
  s.description = %q{pass in a url or urls and mechanize-content will select the best block of text, image and title by analysing the page content}
14
14
  s.email = %q{johnog@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
24
24
  "Rakefile",
25
25
  "VERSION",
26
26
  "lib/mechanize-content.rb",
27
+ "lib/mechanize-content/util.rb",
27
28
  "mechanize-content.gemspec",
28
29
  "spec/mechanize-content_spec.rb",
29
30
  "spec/spec.opts",
@@ -75,32 +75,32 @@ describe "MechanizeContent" do
75
75
 
76
76
  it "reject all gifs" do
77
77
  mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
78
- mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/ablank.gif2").should eql(false)
78
+ MechanizeContent::Util.valid_image?(500, 500, "http://www.cmpevents.com/GD10/ablank.gif2").should eql(false)
79
79
  end
80
80
 
81
81
  it "reject image with banner in the name" do
82
82
  mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
83
- mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/banner.png").should eql(false)
83
+ MechanizeContent::Util.valid_image?(500, 500, "http://www.cmpevents.com/GD10/banner.png").should eql(false)
84
84
  end
85
85
 
86
86
  it "reject image that is too small" do
87
87
  mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
88
- mc.valid_image?(64, 500, "http://www.cmpevents.com/GD10/toosmall.png").should eql(false)
88
+ MechanizeContent::Util.valid_image?(64, 500, "http://www.cmpevents.com/GD10/toosmall.png").should eql(false)
89
89
  end
90
90
 
91
91
  it "allow good images" do
92
92
  mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
93
- mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/perfecto.png").should eql(true)
93
+ MechanizeContent::Util.valid_image?(500, 500, "http://www.cmpevents.com/GD10/perfecto.png").should eql(true)
94
94
  end
95
95
 
96
96
  it "build a base url for images" do
97
97
  mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
98
98
  page = mc.fetch_page("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
99
- mc.get_base_url(page.parser, page.uri).to_s.should eql("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
99
+ MechanizeContent::Util.get_base_url(page.parser, page.uri).to_s.should eql("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
100
100
 
101
101
  mc = MechanizeContent.new("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
102
102
  page = mc.fetch_page("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
103
- mc.get_base_url(page.parser, page.uri).to_s.should eql("http://www.mutinydesign.co.uk/")
103
+ MechanizeContent::Util.get_base_url(page.parser, page.uri).to_s.should eql("http://www.mutinydesign.co.uk/")
104
104
  end
105
105
 
106
106
  it "find image" do
@@ -146,4 +146,28 @@ describe "MechanizeContent" do
146
146
  mc.best_image.should eql("http://assets.vg247.com/current//2010/03/rockbandlogo.jpg")
147
147
  end
148
148
 
149
+ it "gog link no text" do
150
+ mc = MechanizeContent.new("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition", "http://www.destructoid.com/-nuff-said-good-old-games-gets-another-world-168150.phtml", "http://www.joystiq.com/2010/03/18/another-world-15th-anniversary-edition-now-on-gog-com/")
151
+ mc.best_title.should eql("Another World: 15th Anniversary Edition - GOG.com")
152
+ mc.best_text.should eql("Another World -- or Out of this World, as many of you will know it by -- is now on DRM-free digital distribution service Good Old Games. It can be had for $9.99. Need I say more?\rI love the game, even though I have never made it more than oh, five minutes in. It's more or less universally loved by the Destructoid staff. Not long after we got an email detailing the good news, the thread soon reached fifteen or so replies full of praise for the game.\rOther, less exciting recent releases include: Empire Earth II Gold, Gabriel Knight 3, and Aquanox. Not to completely s**t on these games, but this is Another World we're talking about here.")
153
+ mc.best_image.should eql("http://www.blogcdn.com/www.joystiq.com/media/2010/03/anotherworldheaderimg580px223.jpg")
154
+ end
155
+
156
+ it "getting wrong blurb from detructoid" do
157
+ mc = MechanizeContent.new("http://www.destructoid.com/-nuff-said-good-old-games-gets-another-world-168150.phtml")
158
+ mc.best_title.should eql("Destructoid - 'Nuff said: Good Old Games gets Another World")
159
+ mc.best_text.should eql("Another World -- or Out of this World, as many of you will know it by -- is now on DRM-free digital distribution service Good Old Games. It can be had for $9.99. Need I say more?\rI love the game, even though I have never made it more than oh, five minutes in. It's more or less universally loved by the Destructoid staff. Not long after we got an email detailing the good news, the thread soon reached fifteen or so replies full of praise for the game.\rOther, less exciting recent releases include: Empire Earth II Gold, Gabriel Knight 3, and Aquanox. Not to completely s**t on these games, but this is Another World we're talking about here.")
160
+ mc.best_image.should eql(nil)
161
+ end
162
+
163
+ it "avoid using copy from flash sites" do
164
+ mc = MechanizeContent.new("http://www.godofwar.com/spartansstandtall/")
165
+ mc.best_text.should eql(nil)
166
+ end
167
+
168
+ it "get this flash site to return nil for a title" do
169
+ mc = MechanizeContent.new("http://www.thewitcher.com/")
170
+ mc.best_text.should eql(nil)
171
+ end
172
+
149
173
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 1
9
- version: 0.1.1
8
+ - 2
9
+ version: 0.1.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - John Griffin
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-03-19 00:00:00 +00:00
17
+ date: 2010-03-23 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -62,6 +62,7 @@ files:
62
62
  - Rakefile
63
63
  - VERSION
64
64
  - lib/mechanize-content.rb
65
+ - lib/mechanize-content/util.rb
65
66
  - mechanize-content.gemspec
66
67
  - spec/mechanize-content_spec.rb
67
68
  - spec/spec.opts