mechanize-content 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/mechanize-content.rb +206 -0
- data/spec/mechanize-content_spec.rb +149 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +86 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 John Griffin
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
= mechanize-content
|
2
|
+
|
3
|
+
Pass in one or more URLs and mechanize-content will select the best block of text, image and title by analysing the page content.
|
4
|
+
|
5
|
+
== Note on Patches/Pull Requests
|
6
|
+
|
7
|
+
* Fork the project.
|
8
|
+
* Make your feature addition or bug fix.
|
9
|
+
* Add tests for it. This is important so I don't break it in a
|
10
|
+
future version unintentionally.
|
11
|
+
* Commit, do not mess with rakefile, version, or history.
|
12
|
+
(if you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull)
|
13
|
+
* Send me a pull request. Bonus points for topic branches.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2010 John Griffin. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging/release tasks come from Jeweler. It is optional: when it is
# not installed we print a hint and still define the spec/rdoc tasks below.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "mechanize-content"
    gem.summary = %Q{scrape the best content from a page}
    gem.description = %Q{pass in a url or urls and mechanize-content will select the best block of text, image and title by analysing the page content}
    gem.email = "johnog@gmail.com"
    gem.homepage = "http://github.com/john-griffin/mechanize-content"
    gem.authors = ["John Griffin"]
    gem.add_dependency('mechanize', '>= 1.0.0')
    # NOTE(review): lib/mechanize-content.rb also does `require 'image_size'`,
    # but no matching runtime dependency is declared here - confirm the exact
    # gem name/version and add it.
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'spec/rake/spectask'

# rake spec - run every *_spec.rb under spec/.
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
  spec.spec_opts = ["--debugger"]
end

# rake rcov - same specs, run under rcov for coverage.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is defined by Jeweler. Only hook it in when it exists,
# otherwise `rake spec` would abort with "Don't know how to build task"
# on machines without Jeweler, defeating the LoadError fallback above.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: the VERSION file ends with a newline that would leak into the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mechanize-content #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,206 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'image_size'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
# Picks the "best" title, block of text and image out of one or more web
# pages.
#
# Pages are fetched with Mechanize in the order the URLs were given. Each
# best_* reader returns the first usable value it finds and memoizes it.
class MechanizeContent

  attr_accessor :urls

  # An image must be strictly larger than this (in pixels) to be considered.
  MIN_WIDTH = 64
  MIN_HEIGHT = 64

  # Dimensions of the image size that is always rejected (treated as a
  # leaderboard advertisement).
  BANNER_WIDTH = 728
  BANNER_HEIGHT = 90

  # class/id values that mark a container as unlikely to hold the article.
  NEGATIVE_HINT = /(comment|meta|footer|footnote)/
  # class/id values that mark a container as likely to hold the article.
  POSITIVE_HINT = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\s|$))/

  # @param args one or more URLs; a single Array of URLs is accepted too.
  def initialize(*args)
    @urls = args.flatten
  end

  # @return [String] title of the first page that has one, else the first URL.
  def best_title
    @best_title || fetch_titles
  end

  # @return [String, nil] main text of the first page that yields any.
  def best_text
    @best_text || fetch_texts
  end

  # @return [String, nil] absolute URL of the first acceptable image.
  def best_image
    @best_image || fetch_images
  end

  # Walks the pages in order and memoizes the first image found.
  # @return [String, nil]
  def fetch_images
    (@pages || fetch_pages).each do |page|
      image = fetch_image(page)
      return @best_image = image unless image.nil?
    end
    nil
  end

  # Walks the pages in order and memoizes the first text found.
  # @return [String, nil]
  def fetch_texts
    (@pages || fetch_pages).each do |page|
      text = fetch_text(page)
      return @best_text = text unless text.nil?
    end
    nil
  end

  # Walks the pages in order and memoizes the first non-nil title.
  # Falls back to the first URL (not memoized) when no page has a title.
  # @return [String]
  def fetch_titles
    (@pages || fetch_pages).each do |page|
      title = page.title
      return @best_title = clean_utf8(title) unless title.nil?
    end
    @urls.first
  end

  # Fetches every URL and caches the pages that could be retrieved.
  # @return [Array] the successfully fetched pages, in URL order.
  def fetch_pages
    @pages = @urls.map { |url| fetch_page(url) }.compact
  end

  # Fetches a single URL. Failures are reported on stdout and turned into nil.
  # @param url [String]
  # @return [Mechanize::Page, nil] nil on any error or for non-HTML responses.
  def fetch_page(url)
    page = (@agent || init_agent).get(url)
    # Deliberately an exact class comparison: Mechanize returns other classes
    # (e.g. for file downloads) that have no parser/title.
    page.class == Mechanize::Page ? page : nil
  rescue Timeout::Error
    puts "Timeout - #{url}"
    nil
  rescue Errno::ECONNRESET
    puts "Connection reset by peer - #{url}"
    nil
  rescue Mechanize::ResponseCodeError
    puts "Invalid url"
    nil
  rescue Mechanize::UnsupportedSchemeError
    puts "Unsupported Scheme"
    nil
  rescue
    puts "There was a problem connecting - #{url}"
    nil
  end

  # Builds and caches the Mechanize agent used for all requests.
  def init_agent
    agent = Mechanize.new
    agent.user_agent_alias = 'Mac Safari'
    @agent = agent
  end

  # @param page a fetched page
  # @return [String, nil] text of the best content node with tabs/newlines
  #   removed and surrounding whitespace stripped; nil when the page has no
  #   paragraphs.
  def fetch_text(page)
    top_content = fetch_content(page)
    return nil unless top_content

    # Drop invalid bytes first so the string operations below cannot raise
    # on a broken byte sequence.
    clean_utf8(top_content.text).delete("\t\n").strip
  end

  # Scores the parent of every <p> (see #paragraph_score) and returns the
  # highest-scoring one.
  #
  # NOTE: script, iframe, h1 and h2 descendants are unlinked from the returned
  # node, i.e. the page's parsed document is modified.
  #
  # @param page a fetched page (must respond to #parser)
  # @return the winning parent node, or nil when the page has no paragraphs.
  def fetch_content(page)
    scores = Hash.new(0)
    page.parser.css('p').each do |paragraph|
      scores[paragraph.parent] += paragraph_score(paragraph)
    end
    return nil if scores.empty?

    top_result = scores.max_by { |_parent, score| score }.first
    %w[script iframe h1 h2].each { |tag| top_result.css(tag).unlink }
    top_result
  end

  # @param doc parsed document
  # @param url fallback when the document has no <base href>
  # @return the href of the first <base> element, or +url+.
  def get_base_url(doc, url)
    base = doc.xpath("//base/@href").first
    base.nil? ? url : base.value
  end

  # @param page a fetched page
  # @return [String, nil] absolute URL of the best image inside the page's
  #   main content node.
  def fetch_image(page)
    top_content = fetch_content(page)
    return nil unless top_content

    find_best_image(top_content.css('img'), get_base_url(page.parser, page.uri))
  end

  # Decides whether an image is worth using.
  #
  # Rejected: images without a src, images not strictly larger than
  # MIN_WIDTH x MIN_HEIGHT, anything with "banner" or ".gif" in its src, and
  # images that are exactly BANNER_WIDTH x BANNER_HEIGHT.
  #
  # @param width [Integer, nil]
  # @param height [Integer, nil]
  # @param src [String, nil]
  # @return [Boolean]
  def valid_image?(width, height, src)
    return false if src.nil?
    return false unless width.to_i > MIN_WIDTH && height.to_i > MIN_HEIGHT
    return false if src.include?("banner") || src.include?(".gif")

    # Only the exact banner size is rejected; a 728px-wide photo or a
    # 90px-tall strip is still a legitimate image.
    !(width == BANNER_WIDTH && height == BANNER_HEIGHT)
  end

  # Resolves +current_src+ against +url+ when it is relative.
  # @param current_src [String] image src as found in the page
  # @param url [#to_s] base URL
  # @return [String] absolute URL (+current_src+ itself when already absolute)
  # @raise [URI::Error] when either argument cannot be parsed/joined
  def build_absolute_url(current_src, url)
    return current_src unless URI.parse(current_src).relative?

    (URI.parse(url.to_s) + current_src).to_s
  end

  # Picks the first acceptable image.
  #
  # Pass 1 trusts the width/height attributes and needs no network access.
  # Pass 2 downloads each image and measures it. Images without a src are
  # skipped, and a failure on one image is reported and does not stop the
  # search.
  #
  # @param all_images collection of elements responding to ['src'],
  #   ['width'] and ['height']
  # @param url base URL used to resolve relative srcs
  # @return [String, nil] absolute image URL
  def find_best_image(all_images, url)
    candidates = all_images.reject { |img| img['src'].to_s.empty? }

    candidates.each do |img|
      src = img['src']
      next unless valid_image?(img['width'].to_i, img['height'].to_i, src)

      absolute = absolute_image_url(src, url)
      return absolute if absolute
    end

    candidates.each do |img|
      absolute = absolute_image_url(img['src'], url)
      return absolute if absolute && remote_image_valid?(absolute)
    end
    nil
  end

  private

  # Removes byte sequences that are invalid in the string's encoding.
  # Uses String#scrub where available and falls back to Iconv on old Rubies.
  def clean_utf8(text)
    return text.scrub('') if text.respond_to?(:scrub)

    Iconv.new('UTF-8//IGNORE', 'UTF-8').iconv(text + ' ')[0..-2]
  end

  # Score contributed by one paragraph to its parent: class/id hints, +1 for
  # more than 10 characters of text, +1 per comma.
  def paragraph_score(paragraph)
    parent = paragraph.parent
    text = paragraph.inner_text
    score = hint_score(parent['class']) + hint_score(parent['id'])
    score += 1 if text.length > 10
    score + text.count(',')
  end

  # -50 for a negative hint, +25 for a positive one, 0 otherwise.
  def hint_score(attribute)
    value = attribute || ""
    if value =~ NEGATIVE_HINT
      -50
    elsif value =~ POSITIVE_HINT
      25
    else
      0
    end
  end

  # build_absolute_url that reports unparsable URLs and returns nil instead
  # of raising.
  def absolute_image_url(src, url)
    build_absolute_url(src, url)
  rescue URI::Error
    puts "There was a problem connecting - #{src}"
    nil
  end

  # Downloads the image at +src+ and checks its real dimensions.
  # Any download/parsing problem is reported and counts as "not valid".
  def remote_image_valid?(src)
    open_remote(src) do |fh|
      size = ImageSize.new(fh.read)
      valid_image?(size.width, size.height, src)
    end
  rescue Errno::ENOENT
    puts "No such file - #{src}"
    false
  rescue
    puts "There was a problem connecting - #{src}"
    false
  end

  # Opens a remote URL for binary reading. Prefers URI.open: Kernel#open no
  # longer fetches URLs on Ruby 3 and would interpret strings starting with
  # "|" as commands.
  def open_remote(src, &block)
    return URI.open(src, "rb", &block) if URI.respond_to?(:open)

    open(src, "rb", &block)
  end

end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "MechanizeContent" do
|
4
|
+
it "initialise mechanize content" do
|
5
|
+
mc = MechanizeContent.new("http://www.google.com")
|
6
|
+
mc.urls.first.should eql("http://www.google.com")
|
7
|
+
end
|
8
|
+
|
9
|
+
it "fetch the best title" do
|
10
|
+
mc = MechanizeContent.new("http://techmeme.com/")
|
11
|
+
mc.best_title.should eql("Techmeme")
|
12
|
+
end
|
13
|
+
|
14
|
+
it "page has incorrect class so only url returned" do
|
15
|
+
mc = MechanizeContent.new("http://techmeme.com/")
|
16
|
+
agent = mock("agent")
|
17
|
+
page = mock("page")
|
18
|
+
page.stub!(:class).and_return(String)
|
19
|
+
agent.should_receive(:get).with("http://techmeme.com/").and_return(page)
|
20
|
+
mc.should_receive(:init_agent).and_return(agent)
|
21
|
+
mc.best_title.should eql("http://techmeme.com/")
|
22
|
+
end
|
23
|
+
|
24
|
+
it "page has no title so only url returned" do
|
25
|
+
mc = MechanizeContent.new("http://techmeme.com/")
|
26
|
+
agent = mock("agent")
|
27
|
+
page = mock("page")
|
28
|
+
page.stub!(:class).and_return(Mechanize::Page)
|
29
|
+
page.stub!(:title).and_return(nil)
|
30
|
+
agent.should_receive(:get).with("http://techmeme.com/").and_return(page)
|
31
|
+
mc.should_receive(:init_agent).and_return(agent)
|
32
|
+
mc.best_title.should eql("http://techmeme.com/")
|
33
|
+
end
|
34
|
+
|
35
|
+
it "page retrival errors" do
|
36
|
+
mc = MechanizeContent.new("http://techmeme.com/")
|
37
|
+
agent = mock("agent")
|
38
|
+
page = mock("page")
|
39
|
+
page.stub!(:class).and_return(Mechanize::Page)
|
40
|
+
agent.should_receive(:get).with("http://techmeme.com/").and_raise(Timeout::Error)
|
41
|
+
agent.should_receive(:get).with("http://somewherelse.com/").and_raise(Errno::ECONNRESET)
|
42
|
+
mc.should_receive(:init_agent).any_number_of_times.and_return(agent)
|
43
|
+
|
44
|
+
mc.fetch_page("http://techmeme.com/").should eql(nil)
|
45
|
+
mc.fetch_page("http://somewherelse.com/").should eql(nil)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "mechanize page issues" do
|
49
|
+
mc = MechanizeContent.new("http://techmeme.com/")
|
50
|
+
agent = mock("agent")
|
51
|
+
page = mock("page")
|
52
|
+
mc.stub!(:init_agent).and_return(agent)
|
53
|
+
page.stub!(:code).and_return(400)
|
54
|
+
agent.should_receive(:get).with("http://techmeme.com/").and_return(page)
|
55
|
+
mc.fetch_page("http://techmeme.com/").should eql(nil)
|
56
|
+
end
|
57
|
+
|
58
|
+
it "fetch some text" do
|
59
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
60
|
+
page = mc.fetch_page("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
61
|
+
mc.fetch_text(page).should eql(nil)
|
62
|
+
|
63
|
+
mc2 = MechanizeContent.new("http://www.gamesetwatch.com/2010/03/gdc_2010_rounds_off_indie_cove.php")
|
64
|
+
page = mc2.fetch_page("http://www.gamesetwatch.com/2010/03/gdc_2010_rounds_off_indie_cove.php")
|
65
|
+
mc2.fetch_text(page).should eql("Game Developers Conference organizers have confirmed the final set of independent game-specific content, including Ron Carmel on the just-debuted Indie Fund, the Gamma IV party/showcase, and the EGW-replacing Nuovo Sessions game showcase.The newly confirmed details round off a multitude of independent game-specific content at the March 9th-13th event, held at the Moscone Center in San Francisco, including the 12th Annual Independent Games Festival -- featuring over 30 top indie games playable on the GDC Expo floor from Thursday 11th to Saturday 13th, as well as the major IGF Awards on Thursday 11th at 6.30pm.In addition, the 4th Independent Games Summit on Tuesday 9th and Wednesday 10th has added and clarified a number of sessions, with 2D Boy's Ron Carmel kicking off the event with 'Indies and Publishers: Fixing a System That Never Worked', now confirmed to discuss the new Indie Fund organization.Another major new panel, 'Tripping The Art Fantastic', features Spelunky creator Derek Yu, Braid artist David Hellman and Super Meat Boy co-creator Edmund McMillen discussing \"how each one of these figures influences the state of game art, from hand painted epics to short form experimental Flash games.\"")
|
66
|
+
end
|
67
|
+
|
68
|
+
it "find the best text" do
|
69
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
70
|
+
mc.best_text.should eql(nil)
|
71
|
+
|
72
|
+
mc2 = MechanizeContent.new("http://www.gamesetwatch.com/2010/03/gdc_2010_rounds_off_indie_cove.php")
|
73
|
+
mc2.best_text.should eql("Game Developers Conference organizers have confirmed the final set of independent game-specific content, including Ron Carmel on the just-debuted Indie Fund, the Gamma IV party/showcase, and the EGW-replacing Nuovo Sessions game showcase.The newly confirmed details round off a multitude of independent game-specific content at the March 9th-13th event, held at the Moscone Center in San Francisco, including the 12th Annual Independent Games Festival -- featuring over 30 top indie games playable on the GDC Expo floor from Thursday 11th to Saturday 13th, as well as the major IGF Awards on Thursday 11th at 6.30pm.In addition, the 4th Independent Games Summit on Tuesday 9th and Wednesday 10th has added and clarified a number of sessions, with 2D Boy's Ron Carmel kicking off the event with 'Indies and Publishers: Fixing a System That Never Worked', now confirmed to discuss the new Indie Fund organization.Another major new panel, 'Tripping The Art Fantastic', features Spelunky creator Derek Yu, Braid artist David Hellman and Super Meat Boy co-creator Edmund McMillen discussing \"how each one of these figures influences the state of game art, from hand painted epics to short form experimental Flash games.\"")
|
74
|
+
end
|
75
|
+
|
76
|
+
it "reject all gifs" do
|
77
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
78
|
+
mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/ablank.gif2").should eql(false)
|
79
|
+
end
|
80
|
+
|
81
|
+
it "reject image with banner in the name" do
|
82
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
83
|
+
mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/banner.png").should eql(false)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "reject image that is too small" do
|
87
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
88
|
+
mc.valid_image?(64, 500, "http://www.cmpevents.com/GD10/toosmall.png").should eql(false)
|
89
|
+
end
|
90
|
+
|
91
|
+
it "allow good images" do
|
92
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
93
|
+
mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/perfecto.png").should eql(true)
|
94
|
+
end
|
95
|
+
|
96
|
+
it "build a base url for images" do
|
97
|
+
mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
98
|
+
page = mc.fetch_page("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
99
|
+
mc.get_base_url(page.parser, page.uri).to_s.should eql("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
|
100
|
+
|
101
|
+
mc = MechanizeContent.new("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
|
102
|
+
page = mc.fetch_page("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
|
103
|
+
mc.get_base_url(page.parser, page.uri).to_s.should eql("http://www.mutinydesign.co.uk/")
|
104
|
+
end
|
105
|
+
|
106
|
+
it "find image" do
|
107
|
+
mc = MechanizeContent.new("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april")
|
108
|
+
page = mc.fetch_page("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april")
|
109
|
+
mc.fetch_image(page).should eql("http://www.rockstargames.com/rockstar/local_data/US/img/news/eflc_luisjohnny.jpg")
|
110
|
+
|
111
|
+
mc2 = MechanizeContent.new("http://www.joystiq.com/2010/03/18/xbox-360-gaining-usb-storage-support-in-2010-update/")
|
112
|
+
page2 = mc2.fetch_page("http://www.joystiq.com/2010/03/18/xbox-360-gaining-usb-storage-support-in-2010-update/")
|
113
|
+
mc2.fetch_image(page2).should eql("http://www.blogcdn.com/www.joystiq.com/media/2010/03/joystiq-xbox-usb-support-580.jpg")
|
114
|
+
|
115
|
+
mc3 = MechanizeContent.new("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition")
|
116
|
+
page3 = mc3.fetch_page("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition")
|
117
|
+
mc3.fetch_image(page3).should eql(nil)
|
118
|
+
|
119
|
+
mc4 = MechanizeContent.new("http://www.gog.com/page_has_no_content")
|
120
|
+
page4 = mock("page")
|
121
|
+
mc4.stub!(:fetch_content).with(page4).and_return(nil)
|
122
|
+
mc4.fetch_image(page4).should eql(nil)
|
123
|
+
|
124
|
+
mc5 = MechanizeContent.new("http://www.egmnow.com/press/time-warner-retail-egm.html")
|
125
|
+
page5 = mc5.fetch_page("http://www.egmnow.com/press/time-warner-retail-egm.html")
|
126
|
+
mc5.fetch_image(page5).should eql("http://www.egmnow.com/images/egmlogo.jpg")
|
127
|
+
end
|
128
|
+
|
129
|
+
it "find the best image" do
|
130
|
+
mc = MechanizeContent.new("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april")
|
131
|
+
mc.best_image.should eql("http://www.rockstargames.com/rockstar/local_data/US/img/news/eflc_luisjohnny.jpg")
|
132
|
+
|
133
|
+
mc3 = MechanizeContent.new("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition")
|
134
|
+
mc3.best_image.should eql(nil)
|
135
|
+
end
|
136
|
+
|
137
|
+
it "find the best content from multiple urls" do
|
138
|
+
mc = MechanizeContent.new("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april", "http://www.vg247.com/2010/03/18/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3/")
|
139
|
+
mc.best_title.should eql("Rockstar Games | Rockstar News Wire | Episodes from Liberty City Now Coming to PlayStation 3 and PC this April")
|
140
|
+
mc.best_text.should eql("Due to a last minute game submission request from Sony Computer Entertainment Europe to edit some of the in-game Liberty City radio station, television, and internet content – we are forced to delay the worldwide release of Grand Theft Auto: Episodes from Liberty City for both PlayStation 3 and PC for an extra two weeks.\rThe new release date for Episodes from Liberty City - and the two downloadable episodes The Lost and Damned and The Ballad of Gay Tony - on those platforms is now April 13th in North America and April 16th in Europe. This new date will enable us to rectify these changes for Sony Europe, and still allow for a level playing field for all of the Grand Theft Auto fans that have been waiting patiently for this release. In the meantime, we’re moving full speed ahead towards the new game release date. On that note – please be aware that the Grand Theft Auto IV PlayStation 3 leaderboards at Rockstar Games Social Club will be down for maintenance for one week starting March 22nd as we work on their re-launch in support of Episodes from Liberty City.\rBelow are answers to some additional questions that we know some of you may have…\rThose game changes sound pretty minor. Why does the game have to be delayed a whole two weeks?\rUnfortunately, with each round of changes comes fully re-testing the game and a full re-submission to PlayStation. This is the nature of the game submission process. Believe us, if we could expedite the turnaround any sooner – we would. We are dying to get this game in the hands of fans who’ve waited for it for so long in the first place.Why is content being edited just for the European release? 
This doesn’t seem fair.\rThere are different regional requirements for content – whether dictated by ratings boards like the ESRB and BBFC or by SCEE – this is pretty standard in the world of entertainment.\rIf this content is only being edited for the PlayStation 3 release, and only in Europe… why does everyone in North America etc have to wait? And why do PC players have to wait at all?\rThis was a tough decision but with a simultaneous release, everyone can experience multiplayer simultaneously, take part in online events together, be on level ground on leaderboards, etc. What about those Episodes from Liberty City PSN and GFWL Social Club multiplayer events you announced for April 2nd and 3rd? \rThe first Episodes events for those systems will now be on April 16th and 17th. We will most likely replace the originally scheduled early April events with one for another game. Any requests?\rAny other questions, please feel to leave in the Comments area and we’ll do our best to answer. While this sort of thing may be commonplace in the world of interactive entertainment, we know that game delays are as disappointing to you all as they are to us – and we thank all of our fans immensely for their patience and understanding.\rRockstar Games")
|
141
|
+
mc.best_image.should eql("http://www.rockstargames.com/rockstar/local_data/US/img/news/eflc_luisjohnny.jpg")
|
142
|
+
|
143
|
+
mc = MechanizeContent.new("http://www.facebook.com/RockBand", "http://www.vg247.com/2010/03/09/rock-band-3-out-this-holiday-will-revolutionize-genre/")
|
144
|
+
mc.best_title.should eql("Rock Band | Facebook")
|
145
|
+
mc.best_text.should eql("Harmonix just confirmed that Rock Band 3 will release this holiday season.Said the firm on Rock Band’s Facebook page:“Harmonix is developing Rock Band 3 for worldwide release this holiday season! The game, which will be published by MTV Games and distributed by Electronic Arts, will innovate and revolutionize the music genre once again, just as Harmonix did with the original Rock Band, Rock Band 2 and The Beatles: Rock Band. Stay tuned for more details!”There’s no more detail right now, but keep watching for updates from GDC.")
|
146
|
+
mc.best_image.should eql("http://assets.vg247.com/current//2010/03/rockbandlogo.jpg")
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mechanize-content
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- John Griffin
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-19 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mechanize
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
version: 1.0.0
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
34
|
+
description: pass in a url or urls and mechanize-content will select the best block of text, image and title by analysing the page content
|
35
|
+
email: johnog@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- LICENSE
|
42
|
+
- README.rdoc
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.rdoc
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- lib/mechanize-content.rb
|
51
|
+
- spec/mechanize-content_spec.rb
|
52
|
+
- spec/spec.opts
|
53
|
+
- spec/spec_helper.rb
|
54
|
+
has_rdoc: true
|
55
|
+
homepage: http://github.com/john-griffin/mechanize-content
|
56
|
+
licenses: []
|
57
|
+
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options:
|
60
|
+
- --charset=UTF-8
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
requirements: []
|
78
|
+
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.3.6
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: scrape the best content from a page
|
84
|
+
test_files:
|
85
|
+
- spec/mechanize-content_spec.rb
|
86
|
+
- spec/spec_helper.rb
|