mechanize_content 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -1
- data/lib/mechanize_content/image.rb +12 -12
- data/lib/mechanize_content/version.rb +1 -1
- data/spec/mechanize_content/image_spec.rb +15 -5
- metadata +59 -59
data/Gemfile
CHANGED
@@ -4,35 +4,35 @@ module MechanizeContent
|
|
4
4
|
MIN_HEIGHT = 64
|
5
5
|
AD_WIDTH = 728
|
6
6
|
AD_HEIGHT = 90
|
7
|
-
|
7
|
+
|
8
8
|
def self.best_image(images, base_url)
|
9
9
|
imgs = images.map{|i| Image.new(i, base_url)}
|
10
10
|
top_image = imgs.select{|i| i.interesting_css?}.first || imgs.select{|i| i.interesting_file?}.first
|
11
11
|
top_image.absolute_url if top_image
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
def initialize(image, base_url)
|
15
|
-
@src = image["src"]
|
15
|
+
@src = URI.escape(image["src"])
|
16
16
|
@width = image["width"].to_i
|
17
17
|
@height = image["height"].to_i
|
18
18
|
@base_url = base_url
|
19
19
|
end
|
20
|
-
|
20
|
+
|
21
21
|
def interesting_css?
|
22
22
|
valid_image?(@width, @height)
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
def interesting_file?
|
26
26
|
open(absolute_url, "rb") do |fh|
|
27
27
|
is = ImageSize.new(fh.read)
|
28
28
|
return valid_image?(is.width, is.height)
|
29
29
|
end
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
def valid_image?(width, height)
|
33
33
|
big_enough?(width, height) && not_advertising?(width, height) && allows_hotlinking?
|
34
34
|
end
|
35
|
-
|
35
|
+
|
36
36
|
def allows_hotlinking?
|
37
37
|
begin
|
38
38
|
open(absolute_url, "Referer" => "http://splitstate.com")
|
@@ -41,21 +41,21 @@ module MechanizeContent
|
|
41
41
|
end
|
42
42
|
true
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
def advertising?(width, height)
|
46
46
|
@src.include?("banner") || @src.include?(".gif") || ((width == AD_WIDTH) && (height == AD_HEIGHT))
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
def not_advertising?(width, height)
|
50
50
|
!advertising?(width, height)
|
51
51
|
end
|
52
|
-
|
52
|
+
|
53
53
|
def big_enough?(width, height)
|
54
54
|
width > MIN_WIDTH && height > MIN_HEIGHT
|
55
55
|
end
|
56
|
-
|
56
|
+
|
57
57
|
def absolute_url
|
58
58
|
URI.parse(@src).relative? ? (URI.parse(@base_url.to_s)+@src).to_s : @src
|
59
|
-
end
|
59
|
+
end
|
60
60
|
end
|
61
61
|
end
|
@@ -2,21 +2,31 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe MechanizeContent::Image do
|
4
4
|
use_vcr_cassette :record => :new_episodes
|
5
|
-
|
5
|
+
|
6
|
+
describe "#absolute_url" do
|
7
|
+
context "given a uri with a space" do
|
8
|
+
it "will be escaped" do
|
9
|
+
img = {"src" => "http://media.giantbomb.com/uploads/0/26/10180-psn icon_middle.jpg", "width" => 280, "height" => 283}
|
10
|
+
image = MechanizeContent::Image.new(img, "http://www.giantbomb.com/news/the-slow-motion-ballet-of-death-in-max-payne-3/3721/")
|
11
|
+
image.absolute_url.should eq("http://media.giantbomb.com/uploads/0/26/10180-psn%20icon_middle.jpg")
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
6
16
|
describe "#interesting_css?" do
|
7
17
|
context "given a gif" do
|
8
18
|
it "is not interesting" do
|
9
19
|
img = {"src" => "http://www.cmpevents.com/GD10/ablank.gif2", "width" => 500, "height" => 500}
|
10
20
|
image = MechanizeContent::Image.new(img, "https://www.cmpevents.com")
|
11
|
-
image.should_not be_interesting_css
|
21
|
+
image.should_not be_interesting_css
|
12
22
|
end
|
13
23
|
end
|
14
|
-
|
24
|
+
|
15
25
|
context "given a banner" do
|
16
26
|
it "is not interesting" do
|
17
27
|
img = {"src" => "http://www.cmpevents.com/GD10/banner.png", "width" => 500, "height" => 500}
|
18
28
|
image = MechanizeContent::Image.new(img, "https://www.cmpevents.com")
|
19
|
-
image.should_not be_interesting_css
|
29
|
+
image.should_not be_interesting_css
|
20
30
|
end
|
21
31
|
end
|
22
32
|
|
@@ -35,5 +45,5 @@ describe MechanizeContent::Image do
|
|
35
45
|
image.should be_interesting_css
|
36
46
|
end
|
37
47
|
end
|
38
|
-
end
|
48
|
+
end
|
39
49
|
end
|
metadata
CHANGED
@@ -1,82 +1,79 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: mechanize_content
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.2
|
4
5
|
prerelease:
|
5
|
-
version: 0.3.1
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- John Griffin
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2011-10-09 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
16
15
|
name: mechanize
|
17
|
-
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: &70365206410400 !ruby/object:Gem::Requirement
|
19
17
|
none: false
|
20
|
-
requirements:
|
18
|
+
requirements:
|
21
19
|
- - ~>
|
22
|
-
- !ruby/object:Gem::Version
|
20
|
+
- !ruby/object:Gem::Version
|
23
21
|
version: 2.0.1
|
24
22
|
type: :runtime
|
25
|
-
version_requirements: *id001
|
26
|
-
- !ruby/object:Gem::Dependency
|
27
|
-
name: imagesize
|
28
23
|
prerelease: false
|
29
|
-
|
24
|
+
version_requirements: *70365206410400
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: imagesize
|
27
|
+
requirement: &70365206409620 !ruby/object:Gem::Requirement
|
30
28
|
none: false
|
31
|
-
requirements:
|
29
|
+
requirements:
|
32
30
|
- - ~>
|
33
|
-
- !ruby/object:Gem::Version
|
31
|
+
- !ruby/object:Gem::Version
|
34
32
|
version: 0.1.1
|
35
33
|
type: :runtime
|
36
|
-
version_requirements: *id002
|
37
|
-
- !ruby/object:Gem::Dependency
|
38
|
-
name: rspec
|
39
34
|
prerelease: false
|
40
|
-
|
35
|
+
version_requirements: *70365206409620
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &70365206408940 !ruby/object:Gem::Requirement
|
41
39
|
none: false
|
42
|
-
requirements:
|
40
|
+
requirements:
|
43
41
|
- - ~>
|
44
|
-
- !ruby/object:Gem::Version
|
42
|
+
- !ruby/object:Gem::Version
|
45
43
|
version: 2.6.0
|
46
44
|
type: :development
|
47
|
-
version_requirements: *id003
|
48
|
-
- !ruby/object:Gem::Dependency
|
49
|
-
name: vcr
|
50
45
|
prerelease: false
|
51
|
-
|
46
|
+
version_requirements: *70365206408940
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: vcr
|
49
|
+
requirement: &70365206408160 !ruby/object:Gem::Requirement
|
52
50
|
none: false
|
53
|
-
requirements:
|
51
|
+
requirements:
|
54
52
|
- - ~>
|
55
|
-
- !ruby/object:Gem::Version
|
53
|
+
- !ruby/object:Gem::Version
|
56
54
|
version: 1.10.0
|
57
55
|
type: :development
|
58
|
-
version_requirements: *id004
|
59
|
-
- !ruby/object:Gem::Dependency
|
60
|
-
name: fakeweb
|
61
56
|
prerelease: false
|
62
|
-
|
57
|
+
version_requirements: *70365206408160
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: fakeweb
|
60
|
+
requirement: &70365206407500 !ruby/object:Gem::Requirement
|
63
61
|
none: false
|
64
|
-
requirements:
|
62
|
+
requirements:
|
65
63
|
- - ~>
|
66
|
-
- !ruby/object:Gem::Version
|
64
|
+
- !ruby/object:Gem::Version
|
67
65
|
version: 1.3.0
|
68
66
|
type: :development
|
69
|
-
|
70
|
-
|
71
|
-
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70365206407500
|
69
|
+
description: pass in a url or urls and mechanize-content will select the best block
|
70
|
+
of text, image and title by analysing the page content
|
71
|
+
email:
|
72
72
|
- johnog@gmail.com
|
73
73
|
executables: []
|
74
|
-
|
75
74
|
extensions: []
|
76
|
-
|
77
75
|
extra_rdoc_files: []
|
78
|
-
|
79
|
-
files:
|
76
|
+
files:
|
80
77
|
- .gitignore
|
81
78
|
- .rspec
|
82
79
|
- .rvmrc
|
@@ -98,32 +95,35 @@ files:
|
|
98
95
|
- spec/spec_helper.rb
|
99
96
|
homepage: http://github.com/john-griffin/mechanize-content
|
100
97
|
licenses: []
|
101
|
-
|
102
98
|
post_install_message:
|
103
99
|
rdoc_options: []
|
104
|
-
|
105
|
-
require_paths:
|
100
|
+
require_paths:
|
106
101
|
- lib
|
107
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
103
|
none: false
|
109
|
-
requirements:
|
110
|
-
- -
|
111
|
-
- !ruby/object:Gem::Version
|
112
|
-
version:
|
113
|
-
|
104
|
+
requirements:
|
105
|
+
- - ! '>='
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '0'
|
108
|
+
segments:
|
109
|
+
- 0
|
110
|
+
hash: -160408397260513251
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
114
112
|
none: false
|
115
|
-
requirements:
|
116
|
-
- -
|
117
|
-
- !ruby/object:Gem::Version
|
118
|
-
version:
|
113
|
+
requirements:
|
114
|
+
- - ! '>='
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
segments:
|
118
|
+
- 0
|
119
|
+
hash: -160408397260513251
|
119
120
|
requirements: []
|
120
|
-
|
121
121
|
rubyforge_project: mechanize_content
|
122
|
-
rubygems_version: 1.8.
|
122
|
+
rubygems_version: 1.8.7
|
123
123
|
signing_key:
|
124
124
|
specification_version: 3
|
125
125
|
summary: scrape the best content from a page
|
126
|
-
test_files:
|
126
|
+
test_files:
|
127
127
|
- spec/cassettes/MechanizeContent.yml
|
128
128
|
- spec/cassettes/MechanizeContent_Image.yml
|
129
129
|
- spec/mechanize_content/image_spec.rb
|