mechanize_content 0.2.1 → 0.3.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (35)
  1. data/.rvmrc +1 -0
  2. data/Rakefile +5 -0
  3. data/lib/mechanize_content/image.rb +61 -0
  4. data/lib/mechanize_content/page.rb +115 -0
  5. data/lib/mechanize_content/util.rb +4 -31
  6. data/lib/mechanize_content/version.rb +1 -1
  7. data/lib/mechanize_content.rb +8 -168
  8. data/mechanize_content.gemspec +2 -1
  9. data/spec/cassettes/MechanizeContent.yml +33742 -0
  10. data/spec/cassettes/MechanizeContent_Image.yml +279 -0
  11. data/spec/mechanize_content/image_spec.rb +39 -0
  12. data/spec/mechanize_content/page_spec.rb +90 -0
  13. data/spec/mechanize_content_spec.rb +149 -0
  14. data/spec/spec_helper.rb +11 -1
  15. metadata +29 -45
  16. data/spec/fixtures/a-fistful-of-red-dead-redemption-ps3-for-a-few-dollars-less-on.html +0 -754
  17. data/spec/fixtures/another-world-15th-anniversary-edition-now-on-gog-com.html +0 -2416
  18. data/spec/fixtures/another_world_15th_anniversary_edition.html +0 -805
  19. data/spec/fixtures/cmp.html +0 -333
  20. data/spec/fixtures/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april.html +0 -1593
  21. data/spec/fixtures/gdc_2010_rounds_off_indie_cove.html +0 -698
  22. data/spec/fixtures/google.html +0 -42
  23. data/spec/fixtures/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3.html +0 -1012
  24. data/spec/fixtures/johnny.jpg +0 -0
  25. data/spec/fixtures/joystiq-xbox-usb-support-580.jpg +0 -0
  26. data/spec/fixtures/mutiny.html +0 -264
  27. data/spec/fixtures/nuff-said-good-old-games-gets-another-world-168150.html +0 -5492
  28. data/spec/fixtures/rock-band-3-out-this-holiday-will-revolutionize-genre.html +0 -1157
  29. data/spec/fixtures/rockband_facebook.html +0 -93
  30. data/spec/fixtures/spartan.html +0 -391
  31. data/spec/fixtures/techmeme.html +0 -2216
  32. data/spec/fixtures/time-warner-retail-egm.html +0 -49
  33. data/spec/fixtures/witcher.html +0 -458
  34. data/spec/fixtures/xbox-360-gaining-usb-storage-support-in-2010-update.html +0 -2462
  35. data/spec/mechanize-content_spec.rb +0 -202
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm 1.9.2@mechanize_content
data/Rakefile CHANGED
@@ -1,2 +1,7 @@
1
1
  require 'bundler'
2
2
  Bundler::GemHelper.install_tasks
3
+
4
+ desc "Open an irb session preloaded with this library"
5
+ task :console do
6
+ sh "irb -rubygems -I lib -r mechanize_content.rb"
7
+ end
data/lib/mechanize_content/image.rb ADDED
@@ -0,0 +1,61 @@
1
+ module MechanizeContent
2
+ class Image
3
+ MIN_WIDTH = 64
4
+ MIN_HEIGHT = 64
5
+ AD_WIDTH = 728
6
+ AD_HEIGHT = 90
7
+
8
+ def self.best_image(images, base_url)
9
+ imgs = images.map{|i| Image.new(i, base_url)}
10
+ top_image = imgs.select{|i| i.interesting_css?}.first || imgs.select{|i| i.interesting_file?}.first
11
+ top_image.absolute_url if top_image
12
+ end
13
+
14
+ def initialize(image, base_url)
15
+ @src = image["src"]
16
+ @width = image["width"].to_i
17
+ @height = image["height"].to_i
18
+ @base_url = base_url
19
+ end
20
+
21
+ def interesting_css?
22
+ valid_image?(@width, @height)
23
+ end
24
+
25
+ def interesting_file?
26
+ open(absolute_url, "rb") do |fh|
27
+ is = ImageSize.new(fh.read)
28
+ return valid_image?(is.width, is.height)
29
+ end
30
+ end
31
+
32
+ def valid_image?(width, height)
33
+ big_enough?(width, height) && not_advertising?(width, height) && allows_hotlinking?
34
+ end
35
+
36
+ def allows_hotlinking?
37
+ begin
38
+ open(absolute_url, "Referer" => "http://splitstate.com")
39
+ rescue OpenURI::HTTPError, SocketError
40
+ return false
41
+ end
42
+ true
43
+ end
44
+
45
+ def advertising?(width, height)
46
+ @src.include?("banner") || @src.include?(".gif") || ((width == AD_WIDTH) && (height == AD_HEIGHT))
47
+ end
48
+
49
+ def not_advertising?(width, height)
50
+ !advertising?(width, height)
51
+ end
52
+
53
+ def big_enough?(width, height)
54
+ width > MIN_WIDTH && height > MIN_HEIGHT
55
+ end
56
+
57
+ def absolute_url
58
+ URI.parse(@src).relative? ? (URI.parse(@base_url.to_s)+@src).to_s : @src
59
+ end
60
+ end
61
+ end
data/lib/mechanize_content/page.rb ADDED
@@ -0,0 +1,115 @@
1
+ module MechanizeContent
2
+ class Page
3
+ attr_accessor :url
4
+
5
+ def initialize(url)
6
+ @url = url
7
+ end
8
+
9
+ def title
10
+ content.title if content
11
+ end
12
+
13
+ def text
14
+ Util.force_utf8(best_content.text) if best_content && best_content.text.size > 50
15
+ end
16
+
17
+ def image
18
+ @image ||= best_content ? Image.best_image(images, base_url) : nil
19
+ end
20
+
21
+ def images
22
+ best_content.css('img')
23
+ end
24
+
25
+ def base_url
26
+ base = content.parser.xpath("//base/@href").first
27
+ base ? base.value : content.uri
28
+ end
29
+
30
+ def best_content
31
+ @best_content ||= find_content
32
+ end
33
+
34
+ def find_content
35
+ return nil unless content
36
+ doc = content.parser
37
+ readability = {}
38
+ doc.css('p').each do |paragraph|
39
+ if readability[paragraph.parent].nil?
40
+ readability[paragraph.parent] = 0
41
+ end
42
+ parent_class = paragraph.parent['class'] || ""
43
+ parent_id = paragraph.parent['id'] || ""
44
+ if !parent_class.match('(comment|meta|footer|footnote)').nil?
45
+ readability[paragraph.parent] -= 50
46
+ elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
47
+ readability[paragraph.parent] += 25
48
+ end
49
+
50
+ if !parent_id.match('(comment|meta|footer|footnote)').nil?
51
+ readability[paragraph.parent] -= 50
52
+ elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
53
+ readability[paragraph.parent] += 25
54
+ end
55
+
56
+ if paragraph.inner_text().length > 10
57
+ readability[paragraph.parent] += 1
58
+ end
59
+ if !paragraph.parent.attributes.values.nil?
60
+ if !paragraph.parent.attributes.values.first.nil?
61
+ if paragraph.parent.attributes.values.first.value.include? "comment"
62
+ break
63
+ end
64
+ end
65
+ end
66
+ readability[paragraph.parent] += paragraph.inner_text().count(',')
67
+ end
68
+ sorted_results = readability.sort_by { |parent,score| -score }
69
+ if sorted_results.nil? || sorted_results.first.nil?
70
+ return nil
71
+ elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty? || !sorted_results.first.first.xpath("//a[@href='http://www.adobe.com/go/getflashplayer']").empty?
72
+ return nil
73
+ else
74
+ top_result = sorted_results.first.first
75
+ top_result.css('script').unlink
76
+ top_result.css('iframe').unlink
77
+ top_result.css('h1').unlink
78
+ top_result.css('h2').unlink
79
+ top_result.css("div#date-byline").unlink
80
+ top_result.css("p.date").unlink
81
+ top_result.css("div#facebook-like-button").unlink
82
+ return top_result
83
+ end
84
+ end
85
+
86
+
87
+ def content
88
+ @page_content ||= fetch_content
89
+ end
90
+
91
+ def fetch_content
92
+ begin
93
+ page_content = agent.get(@url)
94
+ page_content if page_content.is_a?(Mechanize::Page)
95
+ rescue Timeout::Error
96
+ puts "Timeout - "+@url
97
+ rescue Errno::ECONNRESET
98
+ puts "Connection reset by peer - "+@url
99
+ rescue Mechanize::ResponseCodeError
100
+ puts "Invalid url"
101
+ rescue Mechanize::UnsupportedSchemeError
102
+ puts "Unsupported Scheme"
103
+ rescue SocketError => e
104
+ puts e
105
+ # rescue
106
+ # puts "There was a problem connecting - "+@url
107
+ end
108
+ end
109
+
110
+ def agent
111
+ @agent ||= Mechanize.new {|a| a.user_agent_alias = 'Mac Safari'}
112
+ end
113
+
114
+ end
115
+ end
data/lib/mechanize_content/util.rb CHANGED
@@ -1,35 +1,8 @@
1
1
  module MechanizeContent
2
- class Util
3
-
4
- MIN_WIDTH = 64
5
- MIN_HEIGHT = 64
6
- AD_WIDTH = 728
7
- AD_HEIGHT = 90
8
-
9
- def self.get_base_url(doc, url)
10
- base_url = doc.xpath("//base/@href").first
11
- if base_url.nil?
12
- return url
13
- else
14
- return base_url.value
15
- end
2
+ class Util
3
+ def self.force_utf8(string)
4
+ ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
5
+ ic.iconv(string.delete("\t").delete("\n").strip + ' ')[0..-2]
16
6
  end
17
-
18
- def self.build_absolute_url(current_src, url)
19
- if URI.parse(current_src).relative?
20
- current_src = (URI.parse(url.to_s)+current_src).to_s
21
- end
22
- current_src
23
- end
24
-
25
- def self.valid_image?(width, height, src)
26
- if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
27
- if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
28
- return true
29
- end
30
- end
31
- return false
32
- end
33
-
34
7
  end
35
8
  end
data/lib/mechanize_content/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module MechanizeContent
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/mechanize_content.rb CHANGED
@@ -3,187 +3,27 @@ require 'mechanize'
3
3
  require 'image_size'
4
4
  require 'open-uri'
5
5
  require 'mechanize_content/util'
6
+ require 'mechanize_content/page'
7
+ require 'mechanize_content/image'
6
8
 
7
9
  module MechanizeContent
8
10
  class Parser
9
-
10
- attr_accessor :urls
11
-
11
+ attr_accessor :pages
12
+
12
13
  def initialize(*args)
13
- @urls = *args.flatten
14
+ @pages = *args.flatten.map{|url| Page.new(url)}
14
15
  end
15
16
 
16
17
  def best_title
17
- @best_title ||= fetch_titles
18
+ @pages.map{|page| page.title}.compact.first || @pages.first.url
18
19
  end
19
20
 
20
21
  def best_text
21
- @best_text ||= fetch_texts
22
+ @pages.map{|page| page.text}.compact.first
22
23
  end
23
24
 
24
25
  def best_image
25
- @best_image ||= fetch_images
26
- end
27
-
28
- def fetch_images
29
- (@pages || fetch_pages).each do |page|
30
- image = fetch_image(page)
31
- return image unless image.nil?
32
- end
33
- return nil
34
- end
35
-
36
- def fetch_texts
37
- (@pages || fetch_pages).each do |page|
38
- text = fetch_text(page)
39
- return text unless text.nil? || text.empty?
40
- end
41
- return nil
42
- end
43
-
44
- def fetch_titles
45
- (@pages || fetch_pages).each do |page|
46
- title = page.title
47
- unless title.nil?
48
- ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
49
- title = ic.iconv(title + ' ')[0..-2]
50
- return title
51
- end
52
-
53
- end
54
- return @urls.first
55
- end
56
-
57
- def fetch_pages
58
- @pages = []
59
- @urls.each do |url|
60
- page = fetch_page(url)
61
- @pages << page unless page.nil?
62
- end
63
- @pages
64
- end
65
-
66
- def fetch_page(url)
67
- begin
68
- page = (@agent || init_agent).get(url)
69
- if page.class == Mechanize::Page
70
- return page
71
- else
72
- return nil
73
- end
74
- rescue Timeout::Error
75
- puts "Timeout - "+url
76
- rescue Errno::ECONNRESET
77
- puts "Connection reset by peer - "+url
78
- rescue Mechanize::ResponseCodeError
79
- puts "Invalid url"
80
- rescue Mechanize::UnsupportedSchemeError
81
- puts "Unsupported Scheme"
82
- rescue
83
- puts "There was a problem connecting - "+url
84
- end
26
+ @pages.map{|page| page.image}.compact.first
85
27
  end
86
-
87
- def init_agent
88
- agent = Mechanize.new
89
- agent.user_agent_alias = 'Mac Safari'
90
- return @agent = agent
91
- end
92
-
93
- def fetch_text(page)
94
- top_content = fetch_content(page)
95
- if top_content
96
- text = top_content.text.delete("\t").delete("\n").strip
97
- ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
98
- text = ic.iconv(text + ' ')[0..-2]
99
- else
100
- return nil
101
- end
102
- end
103
-
104
- def fetch_content(page)
105
- doc = page.parser
106
- readability = {}
107
- doc.css('p').each do |paragraph|
108
- if readability[paragraph.parent].nil?
109
- readability[paragraph.parent] = 0
110
- end
111
- parent_class = paragraph.parent['class'] || ""
112
- parent_id = paragraph.parent['id'] || ""
113
- if !parent_class.match('(comment|meta|footer|footnote)').nil?
114
- readability[paragraph.parent] -= 50
115
- elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
116
- readability[paragraph.parent] += 25
117
- end
118
-
119
- if !parent_id.match('(comment|meta|footer|footnote)').nil?
120
- readability[paragraph.parent] -= 50
121
- elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
122
- readability[paragraph.parent] += 25
123
- end
124
-
125
- if paragraph.inner_text().length > 10
126
- readability[paragraph.parent] += 1
127
- end
128
- if !paragraph.parent.attributes.values.nil?
129
- if !paragraph.parent.attributes.values.first.nil?
130
- if paragraph.parent.attributes.values.first.value.include? "comment"
131
- break
132
- end
133
- end
134
- end
135
- readability[paragraph.parent] += paragraph.inner_text().count(',')
136
- end
137
- sorted_results = readability.sort_by { |parent,score| -score }
138
- if sorted_results.nil? || sorted_results.first.nil?
139
- return nil
140
- elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
141
- return nil
142
- else
143
- top_result = sorted_results.first.first
144
- top_result.css('script').unlink
145
- top_result.css('iframe').unlink
146
- top_result.css('h1').unlink
147
- top_result.css('h2').unlink
148
- return top_result
149
- end
150
- end
151
-
152
- def fetch_image(page)
153
- top_content = fetch_content(page)
154
- if top_content
155
- return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
156
- else
157
- return nil
158
- end
159
- end
160
-
161
- def find_best_image(all_images, url)
162
- begin
163
- current_src = nil
164
- all_images.each do |img|
165
- current_src = img["src"]
166
- if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
167
- return Util.build_absolute_url(current_src, url)
168
- end
169
- end
170
- all_images.each do |img|
171
- current_src = img["src"]
172
- current_src = Util.build_absolute_url(current_src, url)
173
- open(current_src, "rb") do |fh|
174
- is = ImageSize.new(fh.read)
175
- if Util.valid_image?(is.width, is.height, current_src)
176
- return current_src
177
- end
178
- end
179
- end
180
- return nil
181
- rescue Errno::ENOENT
182
- puts "No such file - " + current_src
183
- rescue
184
- puts "There was a problem connecting - " + current_src
185
- end
186
- end
187
-
188
28
  end
189
29
  end
data/mechanize_content.gemspec CHANGED
@@ -20,6 +20,7 @@ Gem::Specification.new do |s|
20
20
  s.require_paths = ["lib"]
21
21
  s.add_dependency("mechanize", "~> 1.0.0")
22
22
  s.add_dependency("imagesize", "~> 0.1.1")
23
- s.add_development_dependency('rspec', "~> 2.5.0")
23
+ s.add_development_dependency('rspec', "~> 2.6.0")
24
+ s.add_development_dependency('vcr', "~> 1.9.0")
24
25
  s.add_development_dependency('fakeweb', "~> 1.3.0")
25
26
  end