mechanize_content 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. data/.rvmrc +1 -0
  2. data/Rakefile +5 -0
  3. data/lib/mechanize_content/image.rb +61 -0
  4. data/lib/mechanize_content/page.rb +115 -0
  5. data/lib/mechanize_content/util.rb +4 -31
  6. data/lib/mechanize_content/version.rb +1 -1
  7. data/lib/mechanize_content.rb +8 -168
  8. data/mechanize_content.gemspec +2 -1
  9. data/spec/cassettes/MechanizeContent.yml +33742 -0
  10. data/spec/cassettes/MechanizeContent_Image.yml +279 -0
  11. data/spec/mechanize_content/image_spec.rb +39 -0
  12. data/spec/mechanize_content/page_spec.rb +90 -0
  13. data/spec/mechanize_content_spec.rb +149 -0
  14. data/spec/spec_helper.rb +11 -1
  15. metadata +29 -45
  16. data/spec/fixtures/a-fistful-of-red-dead-redemption-ps3-for-a-few-dollars-less-on.html +0 -754
  17. data/spec/fixtures/another-world-15th-anniversary-edition-now-on-gog-com.html +0 -2416
  18. data/spec/fixtures/another_world_15th_anniversary_edition.html +0 -805
  19. data/spec/fixtures/cmp.html +0 -333
  20. data/spec/fixtures/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april.html +0 -1593
  21. data/spec/fixtures/gdc_2010_rounds_off_indie_cove.html +0 -698
  22. data/spec/fixtures/google.html +0 -42
  23. data/spec/fixtures/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3.html +0 -1012
  24. data/spec/fixtures/johnny.jpg +0 -0
  25. data/spec/fixtures/joystiq-xbox-usb-support-580.jpg +0 -0
  26. data/spec/fixtures/mutiny.html +0 -264
  27. data/spec/fixtures/nuff-said-good-old-games-gets-another-world-168150.html +0 -5492
  28. data/spec/fixtures/rock-band-3-out-this-holiday-will-revolutionize-genre.html +0 -1157
  29. data/spec/fixtures/rockband_facebook.html +0 -93
  30. data/spec/fixtures/spartan.html +0 -391
  31. data/spec/fixtures/techmeme.html +0 -2216
  32. data/spec/fixtures/time-warner-retail-egm.html +0 -49
  33. data/spec/fixtures/witcher.html +0 -458
  34. data/spec/fixtures/xbox-360-gaining-usb-storage-support-in-2010-update.html +0 -2462
  35. data/spec/mechanize-content_spec.rb +0 -202
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm 1.9.2@mechanize_content
data/Rakefile CHANGED
@@ -1,2 +1,7 @@
1
1
  require 'bundler'
2
2
  Bundler::GemHelper.install_tasks
3
+
4
+ desc "Open an irb session preloaded with this library"
5
+ task :console do
6
+ sh "irb -rubygems -I lib -r mechanize_content.rb"
7
+ end
@@ -0,0 +1,61 @@
1
module MechanizeContent
  # Wraps a single <img> element taken from a page's main content and decides
  # whether it is a plausible "representative" image for that page: large
  # enough, not an advert/banner/GIF, and fetchable with a foreign Referer.
  class Image
    # Images must be strictly larger than this in both dimensions.
    MIN_WIDTH = 64
    MIN_HEIGHT = 64
    # Exact dimensions of a leaderboard ad banner; treated as advertising.
    AD_WIDTH = 728
    AD_HEIGHT = 90

    # Picks the first acceptable image and returns its absolute URL.
    #
    # Images whose width/height *attributes* already qualify are preferred;
    # only if none does are the image files downloaded and measured.
    # `find` stops at the first match, so images after the winner are never
    # probed over the network.
    #
    # @param images   [Enumerable] elements responding to ["src"], ["width"], ["height"]
    # @param base_url [#to_s] URL that relative src values are resolved against
    # @return [String, nil] absolute URL of the chosen image, or nil if none qualifies
    def self.best_image(images, base_url)
      candidates = images.map { |node| new(node, base_url) }
      winner = candidates.find { |candidate| candidate.interesting_css? } ||
               candidates.find { |candidate| candidate.interesting_file? }
      winner.absolute_url if winner
    end

    # @param image    [#[]] image element; a missing src is stored as ""
    # @param base_url [#to_s] URL that a relative src is resolved against
    def initialize(image, base_url)
      @src = image["src"].to_s
      @width = image["width"].to_i
      @height = image["height"].to_i
      @base_url = base_url
    end

    # True when the dimensions declared in the markup make this a valid image.
    def interesting_css?
      valid_image?(@width, @height)
    end

    # True when the downloaded file's real dimensions make this a valid image.
    # An image that cannot be fetched (or whose src is missing/malformed) is
    # simply not interesting; it must not abort the search for other images.
    def interesting_file?
      return false if @src.empty?
      open_remote(absolute_url, "rb") do |fh|
        measured = ImageSize.new(fh.read)
        valid_image?(measured.width, measured.height)
      end
    rescue OpenURI::HTTPError, SocketError, SystemCallError, URI::InvalidURIError
      false
    end

    # Size, advertising and hotlinking checks combined. The hotlinking check
    # is last because it is the only one that costs an HTTP request.
    def valid_image?(width, height)
      return false if @src.empty?
      big_enough?(width, height) && not_advertising?(width, height) && allows_hotlinking?
    end

    # True when the image can be fetched while sending a third-party Referer.
    # The block form makes open-uri close the response body instead of
    # leaking the handle.
    def allows_hotlinking?
      open_remote(absolute_url, "Referer" => "http://splitstate.com") { |_io| true }
    rescue OpenURI::HTTPError, SocketError, SystemCallError, URI::InvalidURIError
      false
    end

    # Heuristic: banner in the path, any GIF, or exact ad-banner dimensions.
    def advertising?(width, height)
      @src.include?("banner") || @src.include?(".gif") || (width == AD_WIDTH && height == AD_HEIGHT)
    end

    def not_advertising?(width, height)
      !advertising?(width, height)
    end

    # to_i guards against nil dimensions.
    # NOTE(review): presumably ImageSize can report nil for data it cannot
    # parse - confirm against the imagesize gem.
    def big_enough?(width, height)
      width.to_i > MIN_WIDTH && height.to_i > MIN_HEIGHT
    end

    # @return [String] src resolved against the base URL when it is relative
    # @raise [URI::InvalidURIError] when src is not a parseable URI
    def absolute_url
      URI.parse(@src).relative? ? (URI.parse(@base_url.to_s) + @src).to_s : @src
    end

    private

    # Opens a URL through open-uri. Kernel#open stopped handling URLs in
    # Ruby 3.0, while URI.open does not exist on the old Rubies this gem was
    # written for, so use whichever is available.
    def open_remote(url, *rest, &block)
      if URI.respond_to?(:open)
        URI.open(url, *rest, &block)
      else
        open(url, *rest, &block)
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
module MechanizeContent
  # One remote page. Fetches it lazily with Mechanize and exposes its title,
  # the text of its most "article-like" container, and a representative image.
  class Page
    attr_accessor :url

    # Parent class/id values that suggest boilerplate rather than article body.
    NEGATIVE_HINT = /(comment|meta|footer|footnote)/
    # Parent class/id values that suggest the main article container.
    POSITIVE_HINT = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\s|$))/
    # Links whose presence means the page wants the Flash plugin.
    FLASH_LINKS = [
      "//a[@href='http://get.adobe.com/flashplayer/']",
      "//a[@href='http://www.adobe.com/go/getflashplayer']"
    ].freeze
    # Selectors removed from the winning container, in this order.
    NOISE_SELECTORS = [
      'script', 'iframe', 'h1', 'h2',
      "div#date-byline", "p.date", "div#facebook-like-button"
    ].freeze

    # @param url [String] address of the page to fetch
    def initialize(url)
      @url = url
    end

    # @return [String, nil] the page title, or nil if the page could not be fetched
    def title
      content.title if content
    end

    # @return [String, nil] cleaned text of the best container, or nil when
    #   there is no such container or its text is 50 characters or shorter
    def text
      return nil unless best_content
      raw = best_content.text
      Util.force_utf8(raw) if raw.size > 50
    end

    # @return [String, nil] absolute URL of the best image inside the best container
    # The result is cached even when it is nil: finding it can cost several
    # HTTP requests, which `||=` would repeat on every call.
    def image
      unless defined?(@image)
        @image = best_content ? Image.best_image(images, base_url) : nil
      end
      @image
    end

    # All <img> elements inside the best container (requires best_content).
    def images
      best_content.css('img')
    end

    # The document's <base href> when present, otherwise the fetched page's URI.
    def base_url
      base = content.parser.xpath("//base/@href").first
      base ? base.value : content.uri
    end

    # Cached result of find_content (nil is cached too).
    def best_content
      @best_content = find_content unless defined?(@best_content)
      @best_content
    end

    # Scores every element that directly contains <p> tags and returns the
    # highest-scoring one with scripts, iframes, headings and byline widgets
    # removed.
    #
    # @return [Object, nil] the winning parent node, or nil when the page
    #   could not be fetched, has no paragraphs, or links to the Flash installer
    def find_content
      return nil unless content
      scores = score_paragraph_parents(content.parser)
      best = scores.sort_by { |_parent, score| -score }.first
      return nil if best.nil?
      top_result = best.first
      return nil if wants_flash?(top_result)
      NOISE_SELECTORS.each { |selector| top_result.css(selector).unlink }
      top_result
    end

    # Cached result of fetch_content. A failed fetch (nil) is cached as well,
    # so one unreachable URL is requested and reported once, not once per
    # accessor call.
    def content
      @page_content = fetch_content unless defined?(@page_content)
      @page_content
    end

    # Downloads @url. Connection problems are reported on stdout and turned
    # into nil rather than raised.
    #
    # @return [Mechanize::Page, nil] nil for non-HTML responses and for failures
    def fetch_content
      page_content = agent.get(@url)
      page_content if page_content.is_a?(Mechanize::Page)
    rescue Timeout::Error
      # Interpolation (not String#+) so a non-String url cannot raise here.
      puts "Timeout - #{@url}"
      nil
    rescue Errno::ECONNRESET
      puts "Connection reset by peer - #{@url}"
      nil
    rescue Mechanize::ResponseCodeError
      puts "Invalid url"
      nil
    rescue Mechanize::UnsupportedSchemeError
      puts "Unsupported Scheme"
      nil
    rescue SocketError => e
      puts e
      nil
    end

    # Lazily built Mechanize agent identifying itself as Safari on Mac.
    def agent
      @agent ||= Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
    end

    private

    # Builds a { parent_node => score } hash over every <p> in the document.
    def score_paragraph_parents(doc)
      scores = {}
      doc.css('p').each do |paragraph|
        parent = paragraph.parent
        scores[parent] ||= 0
        scores[parent] += hint_score(parent['class'] || "")
        scores[parent] += hint_score(parent['id'] || "")

        body = paragraph.inner_text
        scores[parent] += 1 if body.length > 10

        # NOTE(review): this stops scanning ALL remaining paragraphs as soon
        # as one parent's first attribute mentions "comment" (and skips the
        # comma bonus for this paragraph). Kept as-is; confirm `break` rather
        # than `next` is intended.
        first_attribute = parent.attributes.values.first
        break if first_attribute && first_attribute.value.include?("comment")

        # Commas are a cheap proxy for prose density.
        scores[parent] += body.count(',')
      end
      scores
    end

    # -50 for boilerplate-looking names, +25 for article-looking names, else 0.
    def hint_score(name)
      return -50 if name =~ NEGATIVE_HINT
      return 25 if name =~ POSITIVE_HINT
      0
    end

    # NOTE(review): the XPath starts with "//", so it matches links anywhere
    # in the document, not just inside +node+. Kept as-is; confirm whether
    # ".//a[...]" was intended.
    def wants_flash?(node)
      FLASH_LINKS.any? { |path| !node.xpath(path).empty? }
    end
  end
end
@@ -1,35 +1,8 @@
1
1
  module MechanizeContent
2
- class Util
3
-
4
- MIN_WIDTH = 64
5
- MIN_HEIGHT = 64
6
- AD_WIDTH = 728
7
- AD_HEIGHT = 90
8
-
9
- def self.get_base_url(doc, url)
10
- base_url = doc.xpath("//base/@href").first
11
- if base_url.nil?
12
- return url
13
- else
14
- return base_url.value
15
- end
2
+ class Util
3
+ def self.force_utf8(string)
4
+ ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
5
+ ic.iconv(string.delete("\t").delete("\n").strip + ' ')[0..-2]
16
6
  end
17
-
18
- def self.build_absolute_url(current_src, url)
19
- if URI.parse(current_src).relative?
20
- current_src = (URI.parse(url.to_s)+current_src).to_s
21
- end
22
- current_src
23
- end
24
-
25
- def self.valid_image?(width, height, src)
26
- if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
27
- if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
28
- return true
29
- end
30
- end
31
- return false
32
- end
33
-
34
7
  end
35
8
  end
@@ -1,3 +1,3 @@
1
1
  module MechanizeContent
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -3,187 +3,27 @@ require 'mechanize'
3
3
  require 'image_size'
4
4
  require 'open-uri'
5
5
  require 'mechanize_content/util'
6
+ require 'mechanize_content/page'
7
+ require 'mechanize_content/image'
6
8
 
7
9
  module MechanizeContent
8
10
  class Parser
9
-
10
- attr_accessor :urls
11
-
11
+ attr_accessor :pages
12
+
12
13
  def initialize(*args)
13
- @urls = *args.flatten
14
+ @pages = *args.flatten.map{|url| Page.new(url)}
14
15
  end
15
16
 
16
17
  def best_title
17
- @best_title ||= fetch_titles
18
+ @pages.map{|page| page.title}.compact.first || @pages.first.url
18
19
  end
19
20
 
20
21
  def best_text
21
- @best_text ||= fetch_texts
22
+ @pages.map{|page| page.text}.compact.first
22
23
  end
23
24
 
24
25
  def best_image
25
- @best_image ||= fetch_images
26
- end
27
-
28
- def fetch_images
29
- (@pages || fetch_pages).each do |page|
30
- image = fetch_image(page)
31
- return image unless image.nil?
32
- end
33
- return nil
34
- end
35
-
36
- def fetch_texts
37
- (@pages || fetch_pages).each do |page|
38
- text = fetch_text(page)
39
- return text unless text.nil? || text.empty?
40
- end
41
- return nil
42
- end
43
-
44
- def fetch_titles
45
- (@pages || fetch_pages).each do |page|
46
- title = page.title
47
- unless title.nil?
48
- ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
49
- title = ic.iconv(title + ' ')[0..-2]
50
- return title
51
- end
52
-
53
- end
54
- return @urls.first
55
- end
56
-
57
- def fetch_pages
58
- @pages = []
59
- @urls.each do |url|
60
- page = fetch_page(url)
61
- @pages << page unless page.nil?
62
- end
63
- @pages
64
- end
65
-
66
- def fetch_page(url)
67
- begin
68
- page = (@agent || init_agent).get(url)
69
- if page.class == Mechanize::Page
70
- return page
71
- else
72
- return nil
73
- end
74
- rescue Timeout::Error
75
- puts "Timeout - "+url
76
- rescue Errno::ECONNRESET
77
- puts "Connection reset by peer - "+url
78
- rescue Mechanize::ResponseCodeError
79
- puts "Invalid url"
80
- rescue Mechanize::UnsupportedSchemeError
81
- puts "Unsupported Scheme"
82
- rescue
83
- puts "There was a problem connecting - "+url
84
- end
26
+ @pages.map{|page| page.image}.compact.first
85
27
  end
86
-
87
- def init_agent
88
- agent = Mechanize.new
89
- agent.user_agent_alias = 'Mac Safari'
90
- return @agent = agent
91
- end
92
-
93
- def fetch_text(page)
94
- top_content = fetch_content(page)
95
- if top_content
96
- text = top_content.text.delete("\t").delete("\n").strip
97
- ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
98
- text = ic.iconv(text + ' ')[0..-2]
99
- else
100
- return nil
101
- end
102
- end
103
-
104
- def fetch_content(page)
105
- doc = page.parser
106
- readability = {}
107
- doc.css('p').each do |paragraph|
108
- if readability[paragraph.parent].nil?
109
- readability[paragraph.parent] = 0
110
- end
111
- parent_class = paragraph.parent['class'] || ""
112
- parent_id = paragraph.parent['id'] || ""
113
- if !parent_class.match('(comment|meta|footer|footnote)').nil?
114
- readability[paragraph.parent] -= 50
115
- elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
116
- readability[paragraph.parent] += 25
117
- end
118
-
119
- if !parent_id.match('(comment|meta|footer|footnote)').nil?
120
- readability[paragraph.parent] -= 50
121
- elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
122
- readability[paragraph.parent] += 25
123
- end
124
-
125
- if paragraph.inner_text().length > 10
126
- readability[paragraph.parent] += 1
127
- end
128
- if !paragraph.parent.attributes.values.nil?
129
- if !paragraph.parent.attributes.values.first.nil?
130
- if paragraph.parent.attributes.values.first.value.include? "comment"
131
- break
132
- end
133
- end
134
- end
135
- readability[paragraph.parent] += paragraph.inner_text().count(',')
136
- end
137
- sorted_results = readability.sort_by { |parent,score| -score }
138
- if sorted_results.nil? || sorted_results.first.nil?
139
- return nil
140
- elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
141
- return nil
142
- else
143
- top_result = sorted_results.first.first
144
- top_result.css('script').unlink
145
- top_result.css('iframe').unlink
146
- top_result.css('h1').unlink
147
- top_result.css('h2').unlink
148
- return top_result
149
- end
150
- end
151
-
152
- def fetch_image(page)
153
- top_content = fetch_content(page)
154
- if top_content
155
- return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
156
- else
157
- return nil
158
- end
159
- end
160
-
161
- def find_best_image(all_images, url)
162
- begin
163
- current_src = nil
164
- all_images.each do |img|
165
- current_src = img["src"]
166
- if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
167
- return Util.build_absolute_url(current_src, url)
168
- end
169
- end
170
- all_images.each do |img|
171
- current_src = img["src"]
172
- current_src = Util.build_absolute_url(current_src, url)
173
- open(current_src, "rb") do |fh|
174
- is = ImageSize.new(fh.read)
175
- if Util.valid_image?(is.width, is.height, current_src)
176
- return current_src
177
- end
178
- end
179
- end
180
- return nil
181
- rescue Errno::ENOENT
182
- puts "No such file - " + current_src
183
- rescue
184
- puts "There was a problem connecting - " + current_src
185
- end
186
- end
187
-
188
28
  end
189
29
  end
@@ -20,6 +20,7 @@ Gem::Specification.new do |s|
20
20
  s.require_paths = ["lib"]
21
21
  s.add_dependency("mechanize", "~> 1.0.0")
22
22
  s.add_dependency("imagesize", "~> 0.1.1")
23
- s.add_development_dependency('rspec', "~> 2.5.0")
23
+ s.add_development_dependency('rspec', "~> 2.6.0")
24
+ s.add_development_dependency('vcr', "~> 1.9.0")
24
25
  s.add_development_dependency('fakeweb', "~> 1.3.0")
25
26
  end