mechanize_content 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rvmrc +1 -0
- data/Rakefile +5 -0
- data/lib/mechanize_content/image.rb +61 -0
- data/lib/mechanize_content/page.rb +115 -0
- data/lib/mechanize_content/util.rb +4 -31
- data/lib/mechanize_content/version.rb +1 -1
- data/lib/mechanize_content.rb +8 -168
- data/mechanize_content.gemspec +2 -1
- data/spec/cassettes/MechanizeContent.yml +33742 -0
- data/spec/cassettes/MechanizeContent_Image.yml +279 -0
- data/spec/mechanize_content/image_spec.rb +39 -0
- data/spec/mechanize_content/page_spec.rb +90 -0
- data/spec/mechanize_content_spec.rb +149 -0
- data/spec/spec_helper.rb +11 -1
- metadata +29 -45
- data/spec/fixtures/a-fistful-of-red-dead-redemption-ps3-for-a-few-dollars-less-on.html +0 -754
- data/spec/fixtures/another-world-15th-anniversary-edition-now-on-gog-com.html +0 -2416
- data/spec/fixtures/another_world_15th_anniversary_edition.html +0 -805
- data/spec/fixtures/cmp.html +0 -333
- data/spec/fixtures/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april.html +0 -1593
- data/spec/fixtures/gdc_2010_rounds_off_indie_cove.html +0 -698
- data/spec/fixtures/google.html +0 -42
- data/spec/fixtures/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3.html +0 -1012
- data/spec/fixtures/johnny.jpg +0 -0
- data/spec/fixtures/joystiq-xbox-usb-support-580.jpg +0 -0
- data/spec/fixtures/mutiny.html +0 -264
- data/spec/fixtures/nuff-said-good-old-games-gets-another-world-168150.html +0 -5492
- data/spec/fixtures/rock-band-3-out-this-holiday-will-revolutionize-genre.html +0 -1157
- data/spec/fixtures/rockband_facebook.html +0 -93
- data/spec/fixtures/spartan.html +0 -391
- data/spec/fixtures/techmeme.html +0 -2216
- data/spec/fixtures/time-warner-retail-egm.html +0 -49
- data/spec/fixtures/witcher.html +0 -458
- data/spec/fixtures/xbox-360-gaining-usb-storage-support-in-2010-update.html +0 -2462
- data/spec/mechanize-content_spec.rb +0 -202
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm 1.9.2@mechanize_content
|
data/Rakefile
CHANGED
data/lib/mechanize_content/image.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module MechanizeContent
|
2
|
+
class Image
|
3
|
+
MIN_WIDTH = 64
|
4
|
+
MIN_HEIGHT = 64
|
5
|
+
AD_WIDTH = 728
|
6
|
+
AD_HEIGHT = 90
|
7
|
+
|
8
|
+
def self.best_image(images, base_url)
|
9
|
+
imgs = images.map{|i| Image.new(i, base_url)}
|
10
|
+
top_image = imgs.select{|i| i.interesting_css?}.first || imgs.select{|i| i.interesting_file?}.first
|
11
|
+
top_image.absolute_url if top_image
|
12
|
+
end
|
13
|
+
|
14
|
+
def initialize(image, base_url)
|
15
|
+
@src = image["src"]
|
16
|
+
@width = image["width"].to_i
|
17
|
+
@height = image["height"].to_i
|
18
|
+
@base_url = base_url
|
19
|
+
end
|
20
|
+
|
21
|
+
def interesting_css?
|
22
|
+
valid_image?(@width, @height)
|
23
|
+
end
|
24
|
+
|
25
|
+
def interesting_file?
|
26
|
+
open(absolute_url, "rb") do |fh|
|
27
|
+
is = ImageSize.new(fh.read)
|
28
|
+
return valid_image?(is.width, is.height)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def valid_image?(width, height)
|
33
|
+
big_enough?(width, height) && not_advertising?(width, height) && allows_hotlinking?
|
34
|
+
end
|
35
|
+
|
36
|
+
def allows_hotlinking?
|
37
|
+
begin
|
38
|
+
open(absolute_url, "Referer" => "http://splitstate.com")
|
39
|
+
rescue OpenURI::HTTPError, SocketError
|
40
|
+
return false
|
41
|
+
end
|
42
|
+
true
|
43
|
+
end
|
44
|
+
|
45
|
+
def advertising?(width, height)
|
46
|
+
@src.include?("banner") || @src.include?(".gif") || ((width == AD_WIDTH) && (height == AD_HEIGHT))
|
47
|
+
end
|
48
|
+
|
49
|
+
def not_advertising?(width, height)
|
50
|
+
!advertising?(width, height)
|
51
|
+
end
|
52
|
+
|
53
|
+
def big_enough?(width, height)
|
54
|
+
width > MIN_WIDTH && height > MIN_HEIGHT
|
55
|
+
end
|
56
|
+
|
57
|
+
def absolute_url
|
58
|
+
URI.parse(@src).relative? ? (URI.parse(@base_url.to_s)+@src).to_s : @src
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
data/lib/mechanize_content/page.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
module MechanizeContent
|
2
|
+
class Page
|
3
|
+
attr_accessor :url
|
4
|
+
|
5
|
+
def initialize(url)
|
6
|
+
@url = url
|
7
|
+
end
|
8
|
+
|
9
|
+
def title
|
10
|
+
content.title if content
|
11
|
+
end
|
12
|
+
|
13
|
+
def text
|
14
|
+
Util.force_utf8(best_content.text) if best_content && best_content.text.size > 50
|
15
|
+
end
|
16
|
+
|
17
|
+
def image
|
18
|
+
@image ||= best_content ? Image.best_image(images, base_url) : nil
|
19
|
+
end
|
20
|
+
|
21
|
+
def images
|
22
|
+
best_content.css('img')
|
23
|
+
end
|
24
|
+
|
25
|
+
def base_url
|
26
|
+
base = content.parser.xpath("//base/@href").first
|
27
|
+
base ? base.value : content.uri
|
28
|
+
end
|
29
|
+
|
30
|
+
def best_content
|
31
|
+
@best_content ||= find_content
|
32
|
+
end
|
33
|
+
|
34
|
+
def find_content
|
35
|
+
return nil unless content
|
36
|
+
doc = content.parser
|
37
|
+
readability = {}
|
38
|
+
doc.css('p').each do |paragraph|
|
39
|
+
if readability[paragraph.parent].nil?
|
40
|
+
readability[paragraph.parent] = 0
|
41
|
+
end
|
42
|
+
parent_class = paragraph.parent['class'] || ""
|
43
|
+
parent_id = paragraph.parent['id'] || ""
|
44
|
+
if !parent_class.match('(comment|meta|footer|footnote)').nil?
|
45
|
+
readability[paragraph.parent] -= 50
|
46
|
+
elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
|
47
|
+
readability[paragraph.parent] += 25
|
48
|
+
end
|
49
|
+
|
50
|
+
if !parent_id.match('(comment|meta|footer|footnote)').nil?
|
51
|
+
readability[paragraph.parent] -= 50
|
52
|
+
elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
|
53
|
+
readability[paragraph.parent] += 25
|
54
|
+
end
|
55
|
+
|
56
|
+
if paragraph.inner_text().length > 10
|
57
|
+
readability[paragraph.parent] += 1
|
58
|
+
end
|
59
|
+
if !paragraph.parent.attributes.values.nil?
|
60
|
+
if !paragraph.parent.attributes.values.first.nil?
|
61
|
+
if paragraph.parent.attributes.values.first.value.include? "comment"
|
62
|
+
break
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
readability[paragraph.parent] += paragraph.inner_text().count(',')
|
67
|
+
end
|
68
|
+
sorted_results = readability.sort_by { |parent,score| -score }
|
69
|
+
if sorted_results.nil? || sorted_results.first.nil?
|
70
|
+
return nil
|
71
|
+
elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty? || !sorted_results.first.first.xpath("//a[@href='http://www.adobe.com/go/getflashplayer']").empty?
|
72
|
+
return nil
|
73
|
+
else
|
74
|
+
top_result = sorted_results.first.first
|
75
|
+
top_result.css('script').unlink
|
76
|
+
top_result.css('iframe').unlink
|
77
|
+
top_result.css('h1').unlink
|
78
|
+
top_result.css('h2').unlink
|
79
|
+
top_result.css("div#date-byline").unlink
|
80
|
+
top_result.css("p.date").unlink
|
81
|
+
top_result.css("div#facebook-like-button").unlink
|
82
|
+
return top_result
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
|
87
|
+
def content
|
88
|
+
@page_content ||= fetch_content
|
89
|
+
end
|
90
|
+
|
91
|
+
def fetch_content
|
92
|
+
begin
|
93
|
+
page_content = agent.get(@url)
|
94
|
+
page_content if page_content.is_a?(Mechanize::Page)
|
95
|
+
rescue Timeout::Error
|
96
|
+
puts "Timeout - "+@url
|
97
|
+
rescue Errno::ECONNRESET
|
98
|
+
puts "Connection reset by peer - "+@url
|
99
|
+
rescue Mechanize::ResponseCodeError
|
100
|
+
puts "Invalid url"
|
101
|
+
rescue Mechanize::UnsupportedSchemeError
|
102
|
+
puts "Unsupported Scheme"
|
103
|
+
rescue SocketError => e
|
104
|
+
puts e
|
105
|
+
# rescue
|
106
|
+
# puts "There was a problem connecting - "+@url
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
def agent
|
111
|
+
@agent ||= Mechanize.new {|a| a.user_agent_alias = 'Mac Safari'}
|
112
|
+
end
|
113
|
+
|
114
|
+
end
|
115
|
+
end
|
data/lib/mechanize_content/util.rb
CHANGED
@@ -1,35 +1,8 @@
|
|
1
1
|
module MechanizeContent
|
2
|
-
class Util
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
AD_WIDTH = 728
|
7
|
-
AD_HEIGHT = 90
|
8
|
-
|
9
|
-
def self.get_base_url(doc, url)
|
10
|
-
base_url = doc.xpath("//base/@href").first
|
11
|
-
if base_url.nil?
|
12
|
-
return url
|
13
|
-
else
|
14
|
-
return base_url.value
|
15
|
-
end
|
2
|
+
class Util
|
3
|
+
def self.force_utf8(string)
|
4
|
+
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
5
|
+
ic.iconv(string.delete("\t").delete("\n").strip + ' ')[0..-2]
|
16
6
|
end
|
17
|
-
|
18
|
-
def self.build_absolute_url(current_src, url)
|
19
|
-
if URI.parse(current_src).relative?
|
20
|
-
current_src = (URI.parse(url.to_s)+current_src).to_s
|
21
|
-
end
|
22
|
-
current_src
|
23
|
-
end
|
24
|
-
|
25
|
-
def self.valid_image?(width, height, src)
|
26
|
-
if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
|
27
|
-
if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
|
28
|
-
return true
|
29
|
-
end
|
30
|
-
end
|
31
|
-
return false
|
32
|
-
end
|
33
|
-
|
34
7
|
end
|
35
8
|
end
|
data/lib/mechanize_content.rb
CHANGED
@@ -3,187 +3,27 @@ require 'mechanize'
|
|
3
3
|
require 'image_size'
|
4
4
|
require 'open-uri'
|
5
5
|
require 'mechanize_content/util'
|
6
|
+
require 'mechanize_content/page'
|
7
|
+
require 'mechanize_content/image'
|
6
8
|
|
7
9
|
module MechanizeContent
|
8
10
|
class Parser
|
9
|
-
|
10
|
-
|
11
|
-
|
11
|
+
attr_accessor :pages
|
12
|
+
|
12
13
|
def initialize(*args)
|
13
|
-
@
|
14
|
+
@pages = *args.flatten.map{|url| Page.new(url)}
|
14
15
|
end
|
15
16
|
|
16
17
|
def best_title
|
17
|
-
@
|
18
|
+
@pages.map{|page| page.title}.compact.first || @pages.first.url
|
18
19
|
end
|
19
20
|
|
20
21
|
def best_text
|
21
|
-
@
|
22
|
+
@pages.map{|page| page.text}.compact.first
|
22
23
|
end
|
23
24
|
|
24
25
|
def best_image
|
25
|
-
@
|
26
|
-
end
|
27
|
-
|
28
|
-
def fetch_images
|
29
|
-
(@pages || fetch_pages).each do |page|
|
30
|
-
image = fetch_image(page)
|
31
|
-
return image unless image.nil?
|
32
|
-
end
|
33
|
-
return nil
|
34
|
-
end
|
35
|
-
|
36
|
-
def fetch_texts
|
37
|
-
(@pages || fetch_pages).each do |page|
|
38
|
-
text = fetch_text(page)
|
39
|
-
return text unless text.nil? || text.empty?
|
40
|
-
end
|
41
|
-
return nil
|
42
|
-
end
|
43
|
-
|
44
|
-
def fetch_titles
|
45
|
-
(@pages || fetch_pages).each do |page|
|
46
|
-
title = page.title
|
47
|
-
unless title.nil?
|
48
|
-
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
49
|
-
title = ic.iconv(title + ' ')[0..-2]
|
50
|
-
return title
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
return @urls.first
|
55
|
-
end
|
56
|
-
|
57
|
-
def fetch_pages
|
58
|
-
@pages = []
|
59
|
-
@urls.each do |url|
|
60
|
-
page = fetch_page(url)
|
61
|
-
@pages << page unless page.nil?
|
62
|
-
end
|
63
|
-
@pages
|
64
|
-
end
|
65
|
-
|
66
|
-
def fetch_page(url)
|
67
|
-
begin
|
68
|
-
page = (@agent || init_agent).get(url)
|
69
|
-
if page.class == Mechanize::Page
|
70
|
-
return page
|
71
|
-
else
|
72
|
-
return nil
|
73
|
-
end
|
74
|
-
rescue Timeout::Error
|
75
|
-
puts "Timeout - "+url
|
76
|
-
rescue Errno::ECONNRESET
|
77
|
-
puts "Connection reset by peer - "+url
|
78
|
-
rescue Mechanize::ResponseCodeError
|
79
|
-
puts "Invalid url"
|
80
|
-
rescue Mechanize::UnsupportedSchemeError
|
81
|
-
puts "Unsupported Scheme"
|
82
|
-
rescue
|
83
|
-
puts "There was a problem connecting - "+url
|
84
|
-
end
|
26
|
+
@pages.map{|page| page.image}.compact.first
|
85
27
|
end
|
86
|
-
|
87
|
-
def init_agent
|
88
|
-
agent = Mechanize.new
|
89
|
-
agent.user_agent_alias = 'Mac Safari'
|
90
|
-
return @agent = agent
|
91
|
-
end
|
92
|
-
|
93
|
-
def fetch_text(page)
|
94
|
-
top_content = fetch_content(page)
|
95
|
-
if top_content
|
96
|
-
text = top_content.text.delete("\t").delete("\n").strip
|
97
|
-
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
98
|
-
text = ic.iconv(text + ' ')[0..-2]
|
99
|
-
else
|
100
|
-
return nil
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
def fetch_content(page)
|
105
|
-
doc = page.parser
|
106
|
-
readability = {}
|
107
|
-
doc.css('p').each do |paragraph|
|
108
|
-
if readability[paragraph.parent].nil?
|
109
|
-
readability[paragraph.parent] = 0
|
110
|
-
end
|
111
|
-
parent_class = paragraph.parent['class'] || ""
|
112
|
-
parent_id = paragraph.parent['id'] || ""
|
113
|
-
if !parent_class.match('(comment|meta|footer|footnote)').nil?
|
114
|
-
readability[paragraph.parent] -= 50
|
115
|
-
elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
|
116
|
-
readability[paragraph.parent] += 25
|
117
|
-
end
|
118
|
-
|
119
|
-
if !parent_id.match('(comment|meta|footer|footnote)').nil?
|
120
|
-
readability[paragraph.parent] -= 50
|
121
|
-
elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
|
122
|
-
readability[paragraph.parent] += 25
|
123
|
-
end
|
124
|
-
|
125
|
-
if paragraph.inner_text().length > 10
|
126
|
-
readability[paragraph.parent] += 1
|
127
|
-
end
|
128
|
-
if !paragraph.parent.attributes.values.nil?
|
129
|
-
if !paragraph.parent.attributes.values.first.nil?
|
130
|
-
if paragraph.parent.attributes.values.first.value.include? "comment"
|
131
|
-
break
|
132
|
-
end
|
133
|
-
end
|
134
|
-
end
|
135
|
-
readability[paragraph.parent] += paragraph.inner_text().count(',')
|
136
|
-
end
|
137
|
-
sorted_results = readability.sort_by { |parent,score| -score }
|
138
|
-
if sorted_results.nil? || sorted_results.first.nil?
|
139
|
-
return nil
|
140
|
-
elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
|
141
|
-
return nil
|
142
|
-
else
|
143
|
-
top_result = sorted_results.first.first
|
144
|
-
top_result.css('script').unlink
|
145
|
-
top_result.css('iframe').unlink
|
146
|
-
top_result.css('h1').unlink
|
147
|
-
top_result.css('h2').unlink
|
148
|
-
return top_result
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
def fetch_image(page)
|
153
|
-
top_content = fetch_content(page)
|
154
|
-
if top_content
|
155
|
-
return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
|
156
|
-
else
|
157
|
-
return nil
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
def find_best_image(all_images, url)
|
162
|
-
begin
|
163
|
-
current_src = nil
|
164
|
-
all_images.each do |img|
|
165
|
-
current_src = img["src"]
|
166
|
-
if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
|
167
|
-
return Util.build_absolute_url(current_src, url)
|
168
|
-
end
|
169
|
-
end
|
170
|
-
all_images.each do |img|
|
171
|
-
current_src = img["src"]
|
172
|
-
current_src = Util.build_absolute_url(current_src, url)
|
173
|
-
open(current_src, "rb") do |fh|
|
174
|
-
is = ImageSize.new(fh.read)
|
175
|
-
if Util.valid_image?(is.width, is.height, current_src)
|
176
|
-
return current_src
|
177
|
-
end
|
178
|
-
end
|
179
|
-
end
|
180
|
-
return nil
|
181
|
-
rescue Errno::ENOENT
|
182
|
-
puts "No such file - " + current_src
|
183
|
-
rescue
|
184
|
-
puts "There was a problem connecting - " + current_src
|
185
|
-
end
|
186
|
-
end
|
187
|
-
|
188
28
|
end
|
189
29
|
end
|
data/mechanize_content.gemspec
CHANGED
@@ -20,6 +20,7 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
s.add_dependency("mechanize", "~> 1.0.0")
|
22
22
|
s.add_dependency("imagesize", "~> 0.1.1")
|
23
|
-
s.add_development_dependency('rspec', "~> 2.
|
23
|
+
s.add_development_dependency('rspec', "~> 2.6.0")
|
24
|
+
s.add_development_dependency('vcr', "~> 1.9.0")
|
24
25
|
s.add_development_dependency('fakeweb', "~> 1.3.0")
|
25
26
|
end
|