mechanize_content 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rvmrc +1 -0
- data/Rakefile +5 -0
- data/lib/mechanize_content/image.rb +61 -0
- data/lib/mechanize_content/page.rb +115 -0
- data/lib/mechanize_content/util.rb +4 -31
- data/lib/mechanize_content/version.rb +1 -1
- data/lib/mechanize_content.rb +8 -168
- data/mechanize_content.gemspec +2 -1
- data/spec/cassettes/MechanizeContent.yml +33742 -0
- data/spec/cassettes/MechanizeContent_Image.yml +279 -0
- data/spec/mechanize_content/image_spec.rb +39 -0
- data/spec/mechanize_content/page_spec.rb +90 -0
- data/spec/mechanize_content_spec.rb +149 -0
- data/spec/spec_helper.rb +11 -1
- metadata +29 -45
- data/spec/fixtures/a-fistful-of-red-dead-redemption-ps3-for-a-few-dollars-less-on.html +0 -754
- data/spec/fixtures/another-world-15th-anniversary-edition-now-on-gog-com.html +0 -2416
- data/spec/fixtures/another_world_15th_anniversary_edition.html +0 -805
- data/spec/fixtures/cmp.html +0 -333
- data/spec/fixtures/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april.html +0 -1593
- data/spec/fixtures/gdc_2010_rounds_off_indie_cove.html +0 -698
- data/spec/fixtures/google.html +0 -42
- data/spec/fixtures/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3.html +0 -1012
- data/spec/fixtures/johnny.jpg +0 -0
- data/spec/fixtures/joystiq-xbox-usb-support-580.jpg +0 -0
- data/spec/fixtures/mutiny.html +0 -264
- data/spec/fixtures/nuff-said-good-old-games-gets-another-world-168150.html +0 -5492
- data/spec/fixtures/rock-band-3-out-this-holiday-will-revolutionize-genre.html +0 -1157
- data/spec/fixtures/rockband_facebook.html +0 -93
- data/spec/fixtures/spartan.html +0 -391
- data/spec/fixtures/techmeme.html +0 -2216
- data/spec/fixtures/time-warner-retail-egm.html +0 -49
- data/spec/fixtures/witcher.html +0 -458
- data/spec/fixtures/xbox-360-gaining-usb-storage-support-in-2010-update.html +0 -2462
- data/spec/mechanize-content_spec.rb +0 -202
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm 1.9.2@mechanize_content
|
data/Rakefile
CHANGED
data/lib/mechanize_content/image.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module MechanizeContent
  # Wraps one <img> element (any object answering ["src"], ["width"] and
  # ["height"]) and decides whether it is a suitable lead image for a page.
  class Image
    # An image must be strictly larger than this in both dimensions.
    MIN_WIDTH  = 64
    MIN_HEIGHT = 64
    # Exact dimensions that are treated as an advertising banner.
    AD_WIDTH   = 728
    AD_HEIGHT  = 90

    # Returns the absolute URL of the first acceptable image, or nil.
    #
    # images   - enumerable of <img> elements.
    # base_url - URL used to resolve relative src values.
    #
    # Candidates are tried first by their declared width/height attributes and
    # only then by downloading the file. `find` stops at the first match, so
    # images after the winner are never probed over the network. Elements
    # without a usable src are skipped instead of raising.
    def self.best_image(images, base_url)
      with_src   = images.reject { |img| img["src"].to_s.strip.empty? }
      candidates = with_src.map { |img| new(img, base_url) }
      winner = candidates.find { |img| img.interesting_css? } ||
               candidates.find { |img| img.interesting_file? }
      winner.absolute_url if winner
    end

    # image    - element providing "src", "width" and "height".
    # base_url - URL used to resolve a relative src.
    def initialize(image, base_url)
      @src      = image["src"]
      @width    = image["width"].to_i   # missing attribute => 0
      @height   = image["height"].to_i
      @base_url = base_url
    end

    # True when the dimensions declared in the markup make this a valid image.
    def interesting_css?
      valid_image?(@width, @height)
    end

    # True when the downloaded file's real dimensions make this a valid image.
    # Returns false when the file cannot be fetched, mirroring the errors
    # handled by allows_hotlinking?.
    #
    # NOTE(review): relies on Kernel#open accepting URLs (open-uri on the
    # Ruby 1.9 line this gem targets); the src originates from page markup.
    def interesting_file?
      open(absolute_url, "rb") do |file|
        size = ImageSize.new(file.read)
        valid_image?(size.width, size.height)
      end
    rescue OpenURI::HTTPError, SocketError
      false
    end

    # Large enough, not an advert, and retrievable with a foreign Referer.
    # The network check comes last so it only runs for plausible images.
    def valid_image?(width, height)
      big_enough?(width, height) && not_advertising?(width, height) && allows_hotlinking?
    end

    # True when the image can be fetched with a third-party Referer header.
    # The block form makes open close the response as soon as the probe ends.
    def allows_hotlinking?
      open(absolute_url, "Referer" => "http://splitstate.com") { |_response| true }
    rescue OpenURI::HTTPError, SocketError
      false
    end

    # True for sources that look like adverts: "banner" or ".gif" in the src,
    # or the exact AD_WIDTH x AD_HEIGHT banner size.
    def advertising?(width, height)
      @src.include?("banner") || @src.include?(".gif") || ((width == AD_WIDTH) && (height == AD_HEIGHT))
    end

    def not_advertising?(width, height)
      !advertising?(width, height)
    end

    # Strictly larger than the minimum in both dimensions. Nil dimensions
    # count as 0 rather than raising.
    def big_enough?(width, height)
      width.to_i > MIN_WIDTH && height.to_i > MIN_HEIGHT
    end

    # The src as an absolute URL; relative values are resolved against base_url.
    def absolute_url
      return @src unless URI.parse(@src).relative?
      (URI.parse(@base_url.to_s) + @src).to_s
    end
  end
end
|
data/lib/mechanize_content/page.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
module MechanizeContent
  # One remote page: fetches it lazily with Mechanize and exposes its title,
  # main text and lead image. Fetch and scoring results are cached, including
  # failures, so a dead URL is requested only once.
  class Page
    # Class/id fragments that mark a container as non-article content.
    NEGATIVE_HINT = /(comment|meta|footer|footnote)/
    # Class/id values that mark a container as article content.
    POSITIVE_HINT = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\s|$))/
    # Links whose presence means the page is a "get Flash" prompt.
    FLASH_LINKS = [
      "http://get.adobe.com/flashplayer/",
      "http://www.adobe.com/go/getflashplayer"
    ]
    # Elements removed from the winning container before it is returned.
    BOILERPLATE_SELECTORS = [
      'script', 'iframe', 'h1', 'h2',
      "div#date-byline", "p.date", "div#facebook-like-button"
    ]

    attr_accessor :url

    def initialize(url)
      @url = url
    end

    # Page title, or nil when the page could not be fetched.
    def title
      content.title if content
    end

    # Cleaned main text, or nil when there is no content container or its
    # text is 50 characters or shorter.
    def text
      Util.force_utf8(best_content.text) if best_content && best_content.text.size > 50
    end

    # Absolute URL of the best image inside the content container, or nil.
    def image
      @image = (best_content ? Image.best_image(images, base_url) : nil) unless defined?(@image)
      @image
    end

    # All <img> elements inside the content container.
    def images
      best_content.css('img')
    end

    # The document's <base href> when present, otherwise the page URI.
    def base_url
      base = content.parser.xpath("//base/@href").first
      base ? base.value : content.uri
    end

    # Cached result of find_content (nil results are cached too).
    def best_content
      @best_content = find_content unless defined?(@best_content)
      @best_content
    end

    # Scores the parent of every <p> and returns the highest-scoring parent
    # with boilerplate elements unlinked. Returns nil when the page could not
    # be fetched, has no paragraphs, or links to a Flash download.
    def find_content
      return nil unless content
      ranked = score_parents(content.parser).sort_by { |_node, score| -score }
      top_node, _score = ranked.first
      return nil if top_node.nil? || flash_prompt?(top_node)
      BOILERPLATE_SELECTORS.each { |selector| top_node.css(selector).unlink }
      top_node
    end

    # Cached result of fetch_content (a failed fetch is cached as nil).
    def content
      @page_content = fetch_content unless defined?(@page_content)
      @page_content
    end

    # Fetches @url and returns the Mechanize::Page, or nil for non-HTML
    # responses. The listed connection errors are reported on stdout and
    # yield nil (puts returns nil); any other error propagates to the caller.
    # Interpolation keeps the handlers working when @url is not a String.
    def fetch_content
      fetched = agent.get(@url)
      fetched if fetched.is_a?(Mechanize::Page)
    rescue Timeout::Error
      puts "Timeout - #{@url}"
    rescue Errno::ECONNRESET
      puts "Connection reset by peer - #{@url}"
    rescue Mechanize::ResponseCodeError
      puts "Invalid url"
    rescue Mechanize::UnsupportedSchemeError
      puts "Unsupported Scheme"
    rescue SocketError => e
      puts e
    end

    def agent
      @agent ||= Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
    end

    private

    # Builds a { parent_node => score } hash from every <p> in doc.
    def score_parents(doc)
      scores = {}
      doc.css('p').each do |paragraph|
        parent = paragraph.parent
        scores[parent] ||= 0
        scores[parent] += hint_score(parent['class']) + hint_score(parent['id'])
        scores[parent] += 1 if paragraph.inner_text.length > 10
        # Scoring stops for the whole document at the first comment container.
        break if comment_container?(parent)
        scores[parent] += paragraph.inner_text.count(',')
      end
      scores
    end

    # -50 for a negative hint, +25 for a positive one, otherwise 0.
    def hint_score(attribute)
      value = attribute.to_s
      return -50 if value =~ NEGATIVE_HINT
      return 25 if value =~ POSITIVE_HINT
      0
    end

    # True when the node's first attribute value mentions "comment".
    def comment_container?(node)
      first_attribute = node.attributes.values.first
      !first_attribute.nil? && first_attribute.value.include?("comment")
    end

    # True when a Flash-download link is found. The XPath starts with "//",
    # so it searches the whole document rather than only below node.
    # NOTE(review): possibly meant ".//a"; left unchanged.
    def flash_prompt?(node)
      FLASH_LINKS.any? { |href| !node.xpath("//a[@href='#{href}']").empty? }
    end
  end
end
|
data/lib/mechanize_content/util.rb
CHANGED
@@ -1,35 +1,8 @@
|
|
1
1
|
module MechanizeContent
|
2
|
-
class Util
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
AD_WIDTH = 728
|
7
|
-
AD_HEIGHT = 90
|
8
|
-
|
9
|
-
def self.get_base_url(doc, url)
|
10
|
-
base_url = doc.xpath("//base/@href").first
|
11
|
-
if base_url.nil?
|
12
|
-
return url
|
13
|
-
else
|
14
|
-
return base_url.value
|
15
|
-
end
|
2
|
+
class Util
|
3
|
+
# Returns string as clean UTF-8 text: bytes that are not valid UTF-8 are
# dropped, tabs and newlines are removed, and surrounding whitespace is
# stripped.
#
# Invalid bytes are dropped before any character-level editing. Iconv is no
# longer part of the standard library (removed in Ruby 2.0), so String#scrub
# is used where available and Iconv remains the fallback on older Rubies.
def self.force_utf8(string)
  utf8 =
    if string.respond_to?(:scrub)
      # The bytes are interpreted as UTF-8 whatever the string's tag, which
      # is what Iconv's 'UTF-8' source encoding does on the fallback path.
      string.dup.force_encoding('UTF-8').scrub('')
    else
      # The trailing space (removed again by [0..-2]) keeps Iconv from
      # failing on an invalid byte at the very end of the input.
      Iconv.new('UTF-8//IGNORE', 'UTF-8').iconv(string + ' ')[0..-2]
    end
  utf8.delete("\t").delete("\n").strip
end
|
17
|
-
|
18
|
-
def self.build_absolute_url(current_src, url)
|
19
|
-
if URI.parse(current_src).relative?
|
20
|
-
current_src = (URI.parse(url.to_s)+current_src).to_s
|
21
|
-
end
|
22
|
-
current_src
|
23
|
-
end
|
24
|
-
|
25
|
-
def self.valid_image?(width, height, src)
|
26
|
-
if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
|
27
|
-
if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
|
28
|
-
return true
|
29
|
-
end
|
30
|
-
end
|
31
|
-
return false
|
32
|
-
end
|
33
|
-
|
34
7
|
end
|
35
8
|
end
|
data/lib/mechanize_content.rb
CHANGED
@@ -3,187 +3,27 @@ require 'mechanize'
|
|
3
3
|
require 'image_size'
|
4
4
|
require 'open-uri'
|
5
5
|
require 'mechanize_content/util'
|
6
|
+
require 'mechanize_content/page'
|
7
|
+
require 'mechanize_content/image'
|
6
8
|
|
7
9
|
module MechanizeContent
|
8
10
|
class Parser
|
9
|
-
|
10
|
-
|
11
|
-
|
11
|
+
attr_accessor :pages
|
12
|
+
|
12
13
|
# Accepts URLs as separate arguments, as arrays, or any mix of the two, and
# wraps each one in a Page.
def initialize(*args)
  urls = args.flatten
  @pages = urls.map { |url| Page.new(url) }
end
|
15
16
|
|
16
17
|
# Returns the title of the first page that has one. Falls back to the first
# page's URL when no page yields a title, and to nil when there are no pages.
#
# Pages are consulted one at a time, so pages after the first hit are never
# asked for their title.
def best_title
  @pages.each do |page|
    title = page.title
    return title if title
  end
  first_page = @pages.first
  first_page.url if first_page
end
|
19
20
|
|
20
21
|
# Returns the text of the first page that yields any, or nil when none does.
#
# Pages are consulted one at a time, so pages after the first hit are never
# asked for their text.
def best_text
  @pages.each do |page|
    text = page.text
    return text if text
  end
  nil
end
|
23
24
|
|
24
25
|
def best_image
|
25
|
-
@
|
26
|
-
end
|
27
|
-
|
28
|
-
def fetch_images
|
29
|
-
(@pages || fetch_pages).each do |page|
|
30
|
-
image = fetch_image(page)
|
31
|
-
return image unless image.nil?
|
32
|
-
end
|
33
|
-
return nil
|
34
|
-
end
|
35
|
-
|
36
|
-
def fetch_texts
|
37
|
-
(@pages || fetch_pages).each do |page|
|
38
|
-
text = fetch_text(page)
|
39
|
-
return text unless text.nil? || text.empty?
|
40
|
-
end
|
41
|
-
return nil
|
42
|
-
end
|
43
|
-
|
44
|
-
def fetch_titles
|
45
|
-
(@pages || fetch_pages).each do |page|
|
46
|
-
title = page.title
|
47
|
-
unless title.nil?
|
48
|
-
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
49
|
-
title = ic.iconv(title + ' ')[0..-2]
|
50
|
-
return title
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
return @urls.first
|
55
|
-
end
|
56
|
-
|
57
|
-
def fetch_pages
|
58
|
-
@pages = []
|
59
|
-
@urls.each do |url|
|
60
|
-
page = fetch_page(url)
|
61
|
-
@pages << page unless page.nil?
|
62
|
-
end
|
63
|
-
@pages
|
64
|
-
end
|
65
|
-
|
66
|
-
def fetch_page(url)
|
67
|
-
begin
|
68
|
-
page = (@agent || init_agent).get(url)
|
69
|
-
if page.class == Mechanize::Page
|
70
|
-
return page
|
71
|
-
else
|
72
|
-
return nil
|
73
|
-
end
|
74
|
-
rescue Timeout::Error
|
75
|
-
puts "Timeout - "+url
|
76
|
-
rescue Errno::ECONNRESET
|
77
|
-
puts "Connection reset by peer - "+url
|
78
|
-
rescue Mechanize::ResponseCodeError
|
79
|
-
puts "Invalid url"
|
80
|
-
rescue Mechanize::UnsupportedSchemeError
|
81
|
-
puts "Unsupported Scheme"
|
82
|
-
rescue
|
83
|
-
puts "There was a problem connecting - "+url
|
84
|
-
end
|
26
|
+
@pages.map{|page| page.image}.compact.first
|
85
27
|
end
|
86
|
-
|
87
|
-
def init_agent
|
88
|
-
agent = Mechanize.new
|
89
|
-
agent.user_agent_alias = 'Mac Safari'
|
90
|
-
return @agent = agent
|
91
|
-
end
|
92
|
-
|
93
|
-
def fetch_text(page)
|
94
|
-
top_content = fetch_content(page)
|
95
|
-
if top_content
|
96
|
-
text = top_content.text.delete("\t").delete("\n").strip
|
97
|
-
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
98
|
-
text = ic.iconv(text + ' ')[0..-2]
|
99
|
-
else
|
100
|
-
return nil
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
def fetch_content(page)
|
105
|
-
doc = page.parser
|
106
|
-
readability = {}
|
107
|
-
doc.css('p').each do |paragraph|
|
108
|
-
if readability[paragraph.parent].nil?
|
109
|
-
readability[paragraph.parent] = 0
|
110
|
-
end
|
111
|
-
parent_class = paragraph.parent['class'] || ""
|
112
|
-
parent_id = paragraph.parent['id'] || ""
|
113
|
-
if !parent_class.match('(comment|meta|footer|footnote)').nil?
|
114
|
-
readability[paragraph.parent] -= 50
|
115
|
-
elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
|
116
|
-
readability[paragraph.parent] += 25
|
117
|
-
end
|
118
|
-
|
119
|
-
if !parent_id.match('(comment|meta|footer|footnote)').nil?
|
120
|
-
readability[paragraph.parent] -= 50
|
121
|
-
elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
|
122
|
-
readability[paragraph.parent] += 25
|
123
|
-
end
|
124
|
-
|
125
|
-
if paragraph.inner_text().length > 10
|
126
|
-
readability[paragraph.parent] += 1
|
127
|
-
end
|
128
|
-
if !paragraph.parent.attributes.values.nil?
|
129
|
-
if !paragraph.parent.attributes.values.first.nil?
|
130
|
-
if paragraph.parent.attributes.values.first.value.include? "comment"
|
131
|
-
break
|
132
|
-
end
|
133
|
-
end
|
134
|
-
end
|
135
|
-
readability[paragraph.parent] += paragraph.inner_text().count(',')
|
136
|
-
end
|
137
|
-
sorted_results = readability.sort_by { |parent,score| -score }
|
138
|
-
if sorted_results.nil? || sorted_results.first.nil?
|
139
|
-
return nil
|
140
|
-
elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
|
141
|
-
return nil
|
142
|
-
else
|
143
|
-
top_result = sorted_results.first.first
|
144
|
-
top_result.css('script').unlink
|
145
|
-
top_result.css('iframe').unlink
|
146
|
-
top_result.css('h1').unlink
|
147
|
-
top_result.css('h2').unlink
|
148
|
-
return top_result
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
def fetch_image(page)
|
153
|
-
top_content = fetch_content(page)
|
154
|
-
if top_content
|
155
|
-
return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
|
156
|
-
else
|
157
|
-
return nil
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
def find_best_image(all_images, url)
|
162
|
-
begin
|
163
|
-
current_src = nil
|
164
|
-
all_images.each do |img|
|
165
|
-
current_src = img["src"]
|
166
|
-
if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
|
167
|
-
return Util.build_absolute_url(current_src, url)
|
168
|
-
end
|
169
|
-
end
|
170
|
-
all_images.each do |img|
|
171
|
-
current_src = img["src"]
|
172
|
-
current_src = Util.build_absolute_url(current_src, url)
|
173
|
-
open(current_src, "rb") do |fh|
|
174
|
-
is = ImageSize.new(fh.read)
|
175
|
-
if Util.valid_image?(is.width, is.height, current_src)
|
176
|
-
return current_src
|
177
|
-
end
|
178
|
-
end
|
179
|
-
end
|
180
|
-
return nil
|
181
|
-
rescue Errno::ENOENT
|
182
|
-
puts "No such file - " + current_src
|
183
|
-
rescue
|
184
|
-
puts "There was a problem connecting - " + current_src
|
185
|
-
end
|
186
|
-
end
|
187
|
-
|
188
28
|
end
|
189
29
|
end
|
data/mechanize_content.gemspec
CHANGED
@@ -20,6 +20,7 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
s.add_dependency("mechanize", "~> 1.0.0")
|
22
22
|
s.add_dependency("imagesize", "~> 0.1.1")
|
23
|
-
s.add_development_dependency('rspec', "~> 2.
|
23
|
+
s.add_development_dependency('rspec', "~> 2.6.0")
|
24
|
+
s.add_development_dependency('vcr', "~> 1.9.0")
|
24
25
|
s.add_development_dependency('fakeweb', "~> 1.3.0")
|
25
26
|
end
|