mechanize-content 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 John Griffin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,17 @@
1
+ = mechanize-content
2
+
3
+ Pass in one or more URLs and mechanize-content will select the best block of text, image and title by analysing the page content.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, but do not change the Rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 John Griffin. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "mechanize-content"
8
+ gem.summary = %Q{scrape the best content from a page}
9
+ gem.description = %Q{pass in a url or urls and mechanize-content will select the best block of text, image and title by analysing the page content}
10
+ gem.email = "johnog@gmail.com"
11
+ gem.homepage = "http://github.com/john-griffin/mechanize-content"
12
+ gem.authors = ["John Griffin"]
13
+ gem.add_dependency('mechanize', '>= 1.0.0')
14
+ end
15
+ Jeweler::GemcutterTasks.new
16
+ rescue LoadError
17
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
18
+ end
19
+
20
+ require 'spec/rake/spectask'
21
+ Spec::Rake::SpecTask.new(:spec) do |spec|
22
+ spec.libs << 'lib' << 'spec'
23
+ spec.spec_files = FileList['spec/**/*_spec.rb']
24
+ spec.spec_opts = ["--debugger"]
25
+ end
26
+
27
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
28
+ spec.libs << 'lib' << 'spec'
29
+ spec.pattern = 'spec/**/*_spec.rb'
30
+ spec.rcov = true
31
+ end
32
+
33
+ task :spec => :check_dependencies
34
+
35
+ task :default => :spec
36
+
37
+ require 'rake/rdoctask'
38
+ Rake::RDocTask.new do |rdoc|
39
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
40
+
41
+ rdoc.rdoc_dir = 'rdoc'
42
+ rdoc.title = "mechanize-content #{version}"
43
+ rdoc.rdoc_files.include('README*')
44
+ rdoc.rdoc_files.include('lib/**/*.rb')
45
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,206 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ require 'image_size'
4
+ require 'open-uri'
5
+
6
+ class MechanizeContent
7
+
8
+ attr_accessor :urls
9
+
10
+ MIN_WIDTH = 64
11
+ MIN_HEIGHT = 64
12
+
13
+ def initialize(*args)
14
+ @urls = *args
15
+ end
16
+
17
+ def best_title
18
+ @best_title || fetch_titles
19
+ end
20
+
21
+ def best_text
22
+ @best_text || fetch_texts
23
+ end
24
+
25
+ def best_image
26
+ @best_image || fetch_images
27
+ end
28
+
29
+ def fetch_images
30
+ (@pages || fetch_pages).each do |page|
31
+ image = fetch_image(page)
32
+ return @best_image = image unless image.nil?
33
+ end
34
+ return nil
35
+ end
36
+
37
+ def fetch_texts
38
+ (@pages || fetch_pages).each do |page|
39
+ text = fetch_text(page)
40
+ return @best_text = text unless text.nil?
41
+ end
42
+ return nil
43
+ end
44
+
45
+ def fetch_titles
46
+ (@pages || fetch_pages).each do |page|
47
+ title = page.title
48
+ unless title.nil?
49
+ ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
50
+ title = ic.iconv(title + ' ')[0..-2]
51
+ return @best_title = title
52
+ end
53
+
54
+ end
55
+ return @urls.first
56
+ end
57
+
58
+ def fetch_pages
59
+ @pages = []
60
+ @urls.each do |url|
61
+ page = fetch_page(url)
62
+ @pages << page unless page.nil?
63
+ end
64
+ @pages
65
+ end
66
+
67
+ def fetch_page(url)
68
+ begin
69
+ page = (@agent || init_agent).get(url)
70
+ if page.class == Mechanize::Page
71
+ return page
72
+ else
73
+ return nil
74
+ end
75
+ rescue Timeout::Error
76
+ puts "Timeout - "+url
77
+ rescue Errno::ECONNRESET
78
+ puts "Connection reset by peer - "+url
79
+ rescue Mechanize::ResponseCodeError
80
+ puts "Invalid url"
81
+ rescue Mechanize::UnsupportedSchemeError
82
+ puts "Unsupported Scheme"
83
+ rescue
84
+ puts "There was a problem connecting - "+url
85
+ end
86
+ end
87
+
88
+ def init_agent
89
+ agent = Mechanize.new
90
+ agent.user_agent_alias = 'Mac Safari'
91
+ return @agent = agent
92
+ end
93
+
94
+ def fetch_text(page)
95
+ top_content = fetch_content(page)
96
+ if top_content
97
+ text = top_content.text.delete("\t").delete("\n").strip
98
+ ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
99
+ text = ic.iconv(text + ' ')[0..-2]
100
+ else
101
+ return nil
102
+ end
103
+ end
104
+
105
+ def fetch_content(page)
106
+ doc = page.parser
107
+ readability = {}
108
+ doc.css('p').each do |paragraph|
109
+ if readability[paragraph.parent].nil?
110
+ readability[paragraph.parent] = 0
111
+ end
112
+ parent_class = paragraph.parent['class'] || ""
113
+ parent_id = paragraph.parent['id'] || ""
114
+ if !parent_class.match('(comment|meta|footer|footnote)').nil?
115
+ readability[paragraph.parent] -= 50
116
+ elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
117
+ readability[paragraph.parent] += 25
118
+ end
119
+
120
+ if !parent_id.match('(comment|meta|footer|footnote)').nil?
121
+ readability[paragraph.parent] -= 50
122
+ elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
123
+ readability[paragraph.parent] += 25
124
+ end
125
+
126
+ if paragraph.inner_text().length > 10
127
+ readability[paragraph.parent] += 1
128
+ end
129
+ readability[paragraph.parent] += paragraph.inner_text().count(',')
130
+ end
131
+ sorted_results = readability.sort_by { |parent,score| -score }
132
+ if sorted_results.nil? || sorted_results.first.nil?
133
+ return nil
134
+ else
135
+ top_result = sorted_results.first.first
136
+ top_result.css('script').unlink
137
+ top_result.css('iframe').unlink
138
+ top_result.css('h1').unlink
139
+ top_result.css('h2').unlink
140
+ return top_result
141
+ end
142
+ end
143
+
144
+ def get_base_url(doc, url)
145
+ base_url = doc.xpath("//base/@href").first
146
+ if base_url.nil?
147
+ return url
148
+ else
149
+ return base_url.value
150
+ end
151
+ end
152
+
153
+ def fetch_image(page)
154
+ top_content = fetch_content(page)
155
+ if top_content
156
+ return find_best_image(top_content.css('img'), get_base_url(page.parser, page.uri))
157
+ else
158
+ return nil
159
+ end
160
+ end
161
+
162
+ def valid_image?(width, height, src)
163
+ if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
164
+ if (!(width == 728) && !(height == 90))
165
+ return true
166
+ end
167
+ end
168
+ return false
169
+ end
170
+
171
+ def build_absolute_url(current_src, url)
172
+ uri = URI.parse(current_src)
173
+ if uri.relative?
174
+ current_src = (URI.parse(url.to_s)+current_src).to_s
175
+ end
176
+ current_src
177
+ end
178
+
179
+ def find_best_image(all_images, url)
180
+ begin
181
+ current_src = nil
182
+ all_images.each do |img|
183
+ current_src = img["src"]
184
+ if valid_image?(img['width'].to_i, img['height'].to_i, current_src)
185
+ return build_absolute_url(current_src, url)
186
+ end
187
+ end
188
+ all_images.each do |img|
189
+ current_src = img["src"]
190
+ current_src = build_absolute_url(current_src, url)
191
+ open(current_src, "rb") do |fh|
192
+ is = ImageSize.new(fh.read)
193
+ if valid_image?(is.width, is.height, current_src)
194
+ return current_src
195
+ end
196
+ end
197
+ end
198
+ return nil
199
+ rescue Errno::ENOENT
200
+ puts "No such file - " + current_src
201
+ rescue
202
+ puts "There was a problem connecting - " + current_src
203
+ end
204
+ end
205
+
206
+ end
@@ -0,0 +1,149 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "MechanizeContent" do
4
+ it "initialise mechanize content" do
5
+ mc = MechanizeContent.new("http://www.google.com")
6
+ mc.urls.first.should eql("http://www.google.com")
7
+ end
8
+
9
+ it "fetch the best title" do
10
+ mc = MechanizeContent.new("http://techmeme.com/")
11
+ mc.best_title.should eql("Techmeme")
12
+ end
13
+
14
+ it "page has incorrect class so only url returned" do
15
+ mc = MechanizeContent.new("http://techmeme.com/")
16
+ agent = mock("agent")
17
+ page = mock("page")
18
+ page.stub!(:class).and_return(String)
19
+ agent.should_receive(:get).with("http://techmeme.com/").and_return(page)
20
+ mc.should_receive(:init_agent).and_return(agent)
21
+ mc.best_title.should eql("http://techmeme.com/")
22
+ end
23
+
24
+ it "page has no title so only url returned" do
25
+ mc = MechanizeContent.new("http://techmeme.com/")
26
+ agent = mock("agent")
27
+ page = mock("page")
28
+ page.stub!(:class).and_return(Mechanize::Page)
29
+ page.stub!(:title).and_return(nil)
30
+ agent.should_receive(:get).with("http://techmeme.com/").and_return(page)
31
+ mc.should_receive(:init_agent).and_return(agent)
32
+ mc.best_title.should eql("http://techmeme.com/")
33
+ end
34
+
35
+ it "page retrival errors" do
36
+ mc = MechanizeContent.new("http://techmeme.com/")
37
+ agent = mock("agent")
38
+ page = mock("page")
39
+ page.stub!(:class).and_return(Mechanize::Page)
40
+ agent.should_receive(:get).with("http://techmeme.com/").and_raise(Timeout::Error)
41
+ agent.should_receive(:get).with("http://somewherelse.com/").and_raise(Errno::ECONNRESET)
42
+ mc.should_receive(:init_agent).any_number_of_times.and_return(agent)
43
+
44
+ mc.fetch_page("http://techmeme.com/").should eql(nil)
45
+ mc.fetch_page("http://somewherelse.com/").should eql(nil)
46
+ end
47
+
48
+ it "mechanize page issues" do
49
+ mc = MechanizeContent.new("http://techmeme.com/")
50
+ agent = mock("agent")
51
+ page = mock("page")
52
+ mc.stub!(:init_agent).and_return(agent)
53
+ page.stub!(:code).and_return(400)
54
+ agent.should_receive(:get).with("http://techmeme.com/").and_return(page)
55
+ mc.fetch_page("http://techmeme.com/").should eql(nil)
56
+ end
57
+
58
+ it "fetch some text" do
59
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
60
+ page = mc.fetch_page("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
61
+ mc.fetch_text(page).should eql(nil)
62
+
63
+ mc2 = MechanizeContent.new("http://www.gamesetwatch.com/2010/03/gdc_2010_rounds_off_indie_cove.php")
64
+ page = mc2.fetch_page("http://www.gamesetwatch.com/2010/03/gdc_2010_rounds_off_indie_cove.php")
65
+ mc2.fetch_text(page).should eql("Game Developers Conference organizers have confirmed the final set of independent game-specific content, including Ron Carmel on the just-debuted Indie Fund, the Gamma IV party/showcase, and the EGW-replacing Nuovo Sessions game showcase.The newly confirmed details round off a multitude of independent game-specific content at the March 9th-13th event, held at the Moscone Center in San Francisco, including the 12th Annual Independent Games Festival -- featuring over 30 top indie games playable on the GDC Expo floor from Thursday 11th to Saturday 13th, as well as the major IGF Awards on Thursday 11th at 6.30pm.In addition, the 4th Independent Games Summit on Tuesday 9th and Wednesday 10th has added and clarified a number of sessions, with 2D Boy's Ron Carmel kicking off the event with 'Indies and Publishers: Fixing a System That Never Worked', now confirmed to discuss the new Indie Fund organization.Another major new panel, 'Tripping The Art Fantastic', features Spelunky creator Derek Yu, Braid artist David Hellman and Super Meat Boy co-creator Edmund McMillen discussing \"how each one of these figures influences the state of game art, from hand painted epics to short form experimental Flash games.\"")
66
+ end
67
+
68
+ it "find the best text" do
69
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
70
+ mc.best_text.should eql(nil)
71
+
72
+ mc2 = MechanizeContent.new("http://www.gamesetwatch.com/2010/03/gdc_2010_rounds_off_indie_cove.php")
73
+ mc2.best_text.should eql("Game Developers Conference organizers have confirmed the final set of independent game-specific content, including Ron Carmel on the just-debuted Indie Fund, the Gamma IV party/showcase, and the EGW-replacing Nuovo Sessions game showcase.The newly confirmed details round off a multitude of independent game-specific content at the March 9th-13th event, held at the Moscone Center in San Francisco, including the 12th Annual Independent Games Festival -- featuring over 30 top indie games playable on the GDC Expo floor from Thursday 11th to Saturday 13th, as well as the major IGF Awards on Thursday 11th at 6.30pm.In addition, the 4th Independent Games Summit on Tuesday 9th and Wednesday 10th has added and clarified a number of sessions, with 2D Boy's Ron Carmel kicking off the event with 'Indies and Publishers: Fixing a System That Never Worked', now confirmed to discuss the new Indie Fund organization.Another major new panel, 'Tripping The Art Fantastic', features Spelunky creator Derek Yu, Braid artist David Hellman and Super Meat Boy co-creator Edmund McMillen discussing \"how each one of these figures influences the state of game art, from hand painted epics to short form experimental Flash games.\"")
74
+ end
75
+
76
+ it "reject all gifs" do
77
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
78
+ mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/ablank.gif2").should eql(false)
79
+ end
80
+
81
+ it "reject image with banner in the name" do
82
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
83
+ mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/banner.png").should eql(false)
84
+ end
85
+
86
+ it "reject image that is too small" do
87
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
88
+ mc.valid_image?(64, 500, "http://www.cmpevents.com/GD10/toosmall.png").should eql(false)
89
+ end
90
+
91
+ it "allow good images" do
92
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
93
+ mc.valid_image?(500, 500, "http://www.cmpevents.com/GD10/perfecto.png").should eql(true)
94
+ end
95
+
96
+ it "build a base url for images" do
97
+ mc = MechanizeContent.new("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
98
+ page = mc.fetch_page("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
99
+ mc.get_base_url(page.parser, page.uri).to_s.should eql("https://www.cmpevents.com/GD10/a.asp?option=C&V=11&SessID=10601")
100
+
101
+ mc = MechanizeContent.new("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
102
+ page = mc.fetch_page("http://www.mutinydesign.co.uk/scripts/html-base-tag---1/")
103
+ mc.get_base_url(page.parser, page.uri).to_s.should eql("http://www.mutinydesign.co.uk/")
104
+ end
105
+
106
+ it "find image" do
107
+ mc = MechanizeContent.new("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april")
108
+ page = mc.fetch_page("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april")
109
+ mc.fetch_image(page).should eql("http://www.rockstargames.com/rockstar/local_data/US/img/news/eflc_luisjohnny.jpg")
110
+
111
+ mc2 = MechanizeContent.new("http://www.joystiq.com/2010/03/18/xbox-360-gaining-usb-storage-support-in-2010-update/")
112
+ page2 = mc2.fetch_page("http://www.joystiq.com/2010/03/18/xbox-360-gaining-usb-storage-support-in-2010-update/")
113
+ mc2.fetch_image(page2).should eql("http://www.blogcdn.com/www.joystiq.com/media/2010/03/joystiq-xbox-usb-support-580.jpg")
114
+
115
+ mc3 = MechanizeContent.new("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition")
116
+ page3 = mc3.fetch_page("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition")
117
+ mc3.fetch_image(page3).should eql(nil)
118
+
119
+ mc4 = MechanizeContent.new("http://www.gog.com/page_has_no_content")
120
+ page4 = mock("page")
121
+ mc4.stub!(:fetch_content).with(page4).and_return(nil)
122
+ mc4.fetch_image(page4).should eql(nil)
123
+
124
+ mc5 = MechanizeContent.new("http://www.egmnow.com/press/time-warner-retail-egm.html")
125
+ page5 = mc5.fetch_page("http://www.egmnow.com/press/time-warner-retail-egm.html")
126
+ mc5.fetch_image(page5).should eql("http://www.egmnow.com/images/egmlogo.jpg")
127
+ end
128
+
129
+ it "find the best image" do
130
+ mc = MechanizeContent.new("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april")
131
+ mc.best_image.should eql("http://www.rockstargames.com/rockstar/local_data/US/img/news/eflc_luisjohnny.jpg")
132
+
133
+ mc3 = MechanizeContent.new("http://www.gog.com/en/gamecard/another_world_15th_anniversary_edition")
134
+ mc3.best_image.should eql(nil)
135
+ end
136
+
137
+ it "find the best content from multiple urls" do
138
+ mc = MechanizeContent.new("http://www.rockstargames.com/newswire/2010/03/18/4061/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april", "http://www.vg247.com/2010/03/18/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3/")
139
+ mc.best_title.should eql("Rockstar Games | Rockstar News Wire | Episodes from Liberty City Now Coming to PlayStation 3 and PC this April")
140
+ mc.best_text.should eql("Due to a last minute game submission request from Sony Computer Entertainment Europe to edit some of the in-game Liberty City radio station, television, and internet content – we are forced to delay the worldwide release of Grand Theft Auto: Episodes from Liberty City for both PlayStation 3 and PC for an extra two weeks.\rThe new release date for Episodes from Liberty City - and the two downloadable episodes The Lost and Damned and The Ballad of Gay Tony - on those platforms is now April 13th in North America and April 16th in Europe.  This new date will enable us to rectify these changes for Sony Europe, and still allow for a level playing field for all of the Grand Theft Auto fans that have been waiting patiently for this release.  In the meantime, we’re moving full speed ahead towards the new game release date.  On that note – please be aware that the Grand Theft Auto IV PlayStation 3 leaderboards at Rockstar Games Social Club will be down for maintenance for one week starting March 22nd as we work on their re-launch in support of Episodes from Liberty City.\rBelow are answers to some additional questions that we know some of you may have…\rThose game changes sound pretty minor.  Why does the game have to be delayed a whole two weeks?\rUnfortunately, with each round of changes comes fully re-testing the game and a full re-submission to PlayStation.  This is the nature of the game submission process.  Believe us, if we could expedite the turnaround any sooner – we would.  We are dying to get this game in the hands of fans who’ve waited for it for so long in the first place.Why is content being edited just for the European release?  
This doesn’t seem fair.\rThere are different regional requirements for content – whether dictated by ratings boards like the ESRB and BBFC or by SCEE – this is pretty standard in the world of entertainment.\rIf this content is only being edited for the PlayStation 3 release, and only in Europe… why does everyone in North America etc have to wait?  And why do PC players have to wait at all?\rThis was a tough decision but with a simultaneous release, everyone can experience multiplayer simultaneously, take part in online events together, be on level ground on leaderboards, etc. What about those Episodes from Liberty City PSN and GFWL Social Club multiplayer events you announced for April 2nd and 3rd?  \rThe first Episodes events for those systems will now be on April 16th and 17th.  We will most likely replace the originally scheduled early April events with one for another game.  Any requests?\rAny other questions, please feel to leave in the Comments area and we’ll do our best to answer.  While this sort of thing may be commonplace in the world of interactive entertainment, we know that game delays are as disappointing to you all as they are to us – and we thank all of our fans immensely for their patience and understanding.\rRockstar Games")
141
+ mc.best_image.should eql("http://www.rockstargames.com/rockstar/local_data/US/img/news/eflc_luisjohnny.jpg")
142
+
143
+ mc = MechanizeContent.new("http://www.facebook.com/RockBand", "http://www.vg247.com/2010/03/09/rock-band-3-out-this-holiday-will-revolutionize-genre/")
144
+ mc.best_title.should eql("Rock Band | Facebook")
145
+ mc.best_text.should eql("Harmonix just confirmed that Rock Band 3 will release this holiday season.Said the firm on Rock Band’s Facebook page:“Harmonix is developing Rock Band 3 for worldwide release this holiday season! The game, which will be published by MTV Games and distributed by Electronic Arts, will innovate and revolutionize the music genre once again, just as Harmonix did with the original Rock Band, Rock Band 2 and The Beatles: Rock Band. Stay tuned for more details!”There’s no more detail right now, but keep watching for updates from GDC.")
146
+ mc.best_image.should eql("http://assets.vg247.com/current//2010/03/rockbandlogo.jpg")
147
+ end
148
+
149
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'mechanize-content'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
+ Spec::Runner.configure do |config|
8
+
9
+ end
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mechanize-content
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - John Griffin
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-19 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: mechanize
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 0
30
+ - 0
31
+ version: 1.0.0
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ description: pass in a url or urls and mechanize-content will select the best block of text, image and title by analysing the page content
35
+ email: johnog@gmail.com
36
+ executables: []
37
+
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - LICENSE
42
+ - README.rdoc
43
+ files:
44
+ - .document
45
+ - .gitignore
46
+ - LICENSE
47
+ - README.rdoc
48
+ - Rakefile
49
+ - VERSION
50
+ - lib/mechanize-content.rb
51
+ - spec/mechanize-content_spec.rb
52
+ - spec/spec.opts
53
+ - spec/spec_helper.rb
54
+ has_rdoc: true
55
+ homepage: http://github.com/john-griffin/mechanize-content
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options:
60
+ - --charset=UTF-8
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ segments:
75
+ - 0
76
+ version: "0"
77
+ requirements: []
78
+
79
+ rubyforge_project:
80
+ rubygems_version: 1.3.6
81
+ signing_key:
82
+ specification_version: 3
83
+ summary: scrape the best content from a page
84
+ test_files:
85
+ - spec/mechanize-content_spec.rb
86
+ - spec/spec_helper.rb