link_oracle 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/link_oracle.rb +1 -0
- data/lib/link_oracle/Utils/image_url_formatter.rb +12 -2
- data/lib/link_oracle/extractor/base.rb +4 -3
- data/lib/link_oracle/extractor/body.rb +28 -11
- data/lib/link_oracle/link_data.rb +6 -7
- data/lib/link_oracle/link_data/data.rb +4 -0
- data/lib/link_oracle/version.rb +1 -1
- data/link_oracle.gemspec +2 -1
- data/spec/link_oracle/Utils/image_url_formatter_spec.rb +21 -9
- data/spec/link_oracle/extractor/body_spec.rb +41 -10
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66b779546f8f2c837e044e40d161365baf2ca004
|
4
|
+
data.tar.gz: 96689ad4ed3ac24d666b098c3ea2d4d9c057eb99
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6649f8b62eba20a02792fea7ac4b7b9018fe73c5a2fb3f8dee7587664cf561b4f7e8b5a54e4fbfa02202e019f7e30a8e414c7d708a7bed751a4cd5a26b3b787c
|
7
|
+
data.tar.gz: f22445e5ea8b082f61ee9c15995d4da57fd05e722f16dfe673e0f114c8f1da6bd9f95412b90025e8154b5c4f8947b9290265842edee4bd0564712ec84e2b60d2
|
data/lib/link_oracle.rb
CHANGED
data/lib/link_oracle/Utils/image_url_formatter.rb
CHANGED
@@ -9,7 +9,13 @@ module Utils
|
|
9
9
|
|
10
10
|
def perform
|
11
11
|
return unless image_url
|
12
|
-
|
12
|
+
if host_missing?
|
13
|
+
"#{scheme}://#{host}#{encoded_image_url}"
|
14
|
+
elsif scheme_missing?
|
15
|
+
"http:#{encoded_image_url}"
|
16
|
+
else
|
17
|
+
encoded_image_url
|
18
|
+
end
|
13
19
|
end
|
14
20
|
|
15
21
|
def encoded_image_url
|
@@ -32,7 +38,11 @@ module Utils
|
|
32
38
|
@parsed_image_url ||= URI.parse(encoded_image_url)
|
33
39
|
end
|
34
40
|
|
35
|
-
def
|
41
|
+
def scheme_missing?
|
42
|
+
parsed_image_url.scheme.to_s.empty?
|
43
|
+
end
|
44
|
+
|
45
|
+
def host_missing?
|
36
46
|
!parsed_image_url.host
|
37
47
|
end
|
38
48
|
end
|
data/lib/link_oracle/extractor/base.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
class LinkOracle
|
2
2
|
module Extractor
|
3
3
|
class Base
|
4
|
-
attr_reader :parsed_body, :link_data
|
4
|
+
attr_reader :parsed_body, :url, :link_data
|
5
5
|
|
6
|
-
def initialize(
|
7
|
-
@parsed_body =
|
6
|
+
def initialize(parsed_url)
|
7
|
+
@parsed_body = parsed_url[:parsed_data]
|
8
|
+
@url = parsed_url[:url]
|
8
9
|
@link_data = LinkData::Data.new
|
9
10
|
end
|
10
11
|
|
data/lib/link_oracle/extractor/body.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
1
|
class LinkOracle
|
2
2
|
module Extractor
|
3
|
-
class Body
|
4
|
-
|
5
|
-
|
6
|
-
def initialize(parsed_body)
|
7
|
-
@parsed_body = parsed_body
|
8
|
-
@link_data = LinkData::Data.new
|
3
|
+
class Body < Base
|
4
|
+
def type
|
5
|
+
:body
|
9
6
|
end
|
10
7
|
|
11
8
|
def perform
|
@@ -14,7 +11,6 @@ class LinkOracle
|
|
14
11
|
image_urls: images,
|
15
12
|
descriptions: descriptions
|
16
13
|
})
|
17
|
-
|
18
14
|
end
|
19
15
|
|
20
16
|
def titles
|
@@ -24,9 +20,30 @@ class LinkOracle
|
|
24
20
|
end
|
25
21
|
|
26
22
|
def images
|
27
|
-
@images ||=
|
28
|
-
|
29
|
-
|
23
|
+
@images ||= valid_size_images
|
24
|
+
end
|
25
|
+
|
26
|
+
def parsed_images
|
27
|
+
@parsed_images ||= parsed_body.xpath(
|
28
|
+
"//img[@src[(contains(.,'://') or contains(., '/')) and not(contains(.,'ads.') or contains(.,'ad.') or contains(.,'?') or contains(.,'.gif'))]]"
|
29
|
+
).map{ |node| node['src'] }
|
30
|
+
end
|
31
|
+
|
32
|
+
def formatted_images
|
33
|
+
parsed_images.map { |image_url| ::Utils::ImageUrlFormatter.new(url, image_url).perform }
|
34
|
+
end
|
35
|
+
|
36
|
+
def valid_size_images
|
37
|
+
formatted_images.select do |image|
|
38
|
+
size = image_size(image)
|
39
|
+
size[0] >= 100 && size[1] >= 100 if size
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def image_size(image)
|
44
|
+
::FastImage.size(image)
|
45
|
+
rescue ::URI::InvalidURIError
|
46
|
+
[0, 0]
|
30
47
|
end
|
31
48
|
|
32
49
|
def descriptions
|
@@ -34,4 +51,4 @@ class LinkOracle
|
|
34
51
|
end
|
35
52
|
end
|
36
53
|
end
|
37
|
-
end
|
54
|
+
end
|
data/lib/link_oracle/link_data.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
class LinkOracle
|
2
2
|
class LinkData
|
3
|
-
attr_reader :
|
3
|
+
attr_reader :parsed_url
|
4
4
|
|
5
5
|
def initialize(parsed_url)
|
6
|
-
@
|
7
|
-
@url = parsed_url[:url]
|
6
|
+
@parsed_url = parsed_url
|
8
7
|
end
|
9
8
|
|
10
9
|
#TODO: Need to write tests for these
|
@@ -17,19 +16,19 @@ class LinkOracle
|
|
17
16
|
end
|
18
17
|
|
19
18
|
def image_url
|
20
|
-
|
19
|
+
og.image_url || meta.image_url || body.image_url
|
21
20
|
end
|
22
21
|
|
23
22
|
def og
|
24
|
-
@og ||= Extractor::OG.new(
|
23
|
+
@og ||= Extractor::OG.new(parsed_url).perform
|
25
24
|
end
|
26
25
|
|
27
26
|
def meta
|
28
|
-
@meta ||= Extractor::Meta.new(
|
27
|
+
@meta ||= Extractor::Meta.new(parsed_url).perform
|
29
28
|
end
|
30
29
|
|
31
30
|
def body
|
32
|
-
@body ||= Extractor::Body.new(
|
31
|
+
@body ||= Extractor::Body.new(parsed_url).perform
|
33
32
|
end
|
34
33
|
end
|
35
34
|
end
|
data/lib/link_oracle/version.rb
CHANGED
data/link_oracle.gemspec
CHANGED
@@ -7,7 +7,7 @@ require 'link_oracle/version'
|
|
7
7
|
Gem::Specification.new do |spec|
|
8
8
|
spec.name = "link_oracle"
|
9
9
|
spec.version = LinkOracle::VERSION
|
10
|
-
spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri']
|
10
|
+
spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri', 'Rae Bonfanti', 'Zoe Madden-Wood']
|
11
11
|
spec.email = ["developers@socialchorus.com"]
|
12
12
|
spec.description = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
|
13
13
|
spec.summary = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
|
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_dependency 'nokogiri'
|
23
23
|
spec.add_dependency 'curb'
|
24
|
+
spec.add_dependency 'fastimage'
|
24
25
|
|
25
26
|
spec.add_development_dependency "bundler", "~> 1.3"
|
26
27
|
spec.add_development_dependency "rake"
|
data/spec/link_oracle/Utils/image_url_formatter_spec.rb
CHANGED
@@ -2,19 +2,31 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Utils::ImageUrlFormatter do
|
4
4
|
let(:url) { "http://berkin.com/whatever/else/is/here" }
|
5
|
-
let(:image_url) { "/some/stupid/path" }
|
6
5
|
let(:formatted_url) { Utils::ImageUrlFormatter.new(url, image_url).perform }
|
7
6
|
|
8
|
-
context '
|
9
|
-
|
10
|
-
|
7
|
+
context 'the host is missing from the image url' do
|
8
|
+
let(:image_url) { "/some/stupid/path" }
|
9
|
+
|
10
|
+
context 'scheme is http' do
|
11
|
+
it 'should return the image as a full url using the host as domain' do
|
12
|
+
expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
context 'scheme is https' do
|
17
|
+
let(:url) { "https://berkin.com/whatever/else/is/here" }
|
18
|
+
|
19
|
+
it 'should return the image as a full url using the host as domain' do
|
20
|
+
expect(formatted_url).to eq('https://berkin.com/some/stupid/path')
|
21
|
+
end
|
11
22
|
end
|
12
23
|
end
|
13
24
|
|
14
|
-
context 'scheme is
|
15
|
-
let(:
|
16
|
-
|
17
|
-
|
25
|
+
context 'but the host is present, but the scheme is missing' do
|
26
|
+
let(:image_url) { "//berkin.com/some/stupid/path" }
|
27
|
+
|
28
|
+
it 'should return the image as a full url using http as the protocol' do
|
29
|
+
expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
|
18
30
|
end
|
19
31
|
end
|
20
32
|
|
@@ -27,7 +39,7 @@ describe Utils::ImageUrlFormatter do
|
|
27
39
|
end
|
28
40
|
|
29
41
|
it 'returns nil' do
|
30
|
-
formatted_url.
|
42
|
+
expect(formatted_url).to be_nil
|
31
43
|
end
|
32
44
|
end
|
33
45
|
end
|
data/spec/link_oracle/extractor/body_spec.rb
CHANGED
@@ -2,7 +2,7 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe LinkOracle::Extractor::Body do
|
4
4
|
let(:parsed_body) { ::Nokogiri::HTML.parse(body) }
|
5
|
-
let(:link_data) { LinkOracle::Extractor::Body.new(parsed_body).perform }
|
5
|
+
let(:link_data) { LinkOracle::Extractor::Body.new({ url: 'http://berkin.com', parsed_data: parsed_body}).perform }
|
6
6
|
|
7
7
|
let(:body) {
|
8
8
|
<<-HTML
|
@@ -34,6 +34,10 @@ describe LinkOracle::Extractor::Body do
|
|
34
34
|
}
|
35
35
|
|
36
36
|
describe 'perform' do
|
37
|
+
before do
|
38
|
+
FastImage.stub(:size).and_return([0, 0])
|
39
|
+
end
|
40
|
+
|
37
41
|
context 'there is no suitable stuff in the body' do
|
38
42
|
let(:body) {
|
39
43
|
"<html>
|
@@ -58,14 +62,6 @@ describe LinkOracle::Extractor::Body do
|
|
58
62
|
]
|
59
63
|
end
|
60
64
|
|
61
|
-
it 'should populate link_data image_urls' do
|
62
|
-
link_data.image_urls.should == [
|
63
|
-
"http://berkin.com",
|
64
|
-
"http://cherbin.com",
|
65
|
-
"http://flerbin.com"
|
66
|
-
]
|
67
|
-
end
|
68
|
-
|
69
65
|
it 'should populate link_data descriptions' do
|
70
66
|
link_data.descriptions.should == [
|
71
67
|
"paragraph 1",
|
@@ -73,6 +69,41 @@ describe LinkOracle::Extractor::Body do
|
|
73
69
|
"paragraph 3"
|
74
70
|
]
|
75
71
|
end
|
72
|
+
|
73
|
+
context 'images are a correct size' do
|
74
|
+
before do
|
75
|
+
FastImage.stub(:size).and_return([100, 121])
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'should populate link_data image_urls' do
|
79
|
+
expect(link_data.image_urls).to match_array([
|
80
|
+
"http://berkin.com",
|
81
|
+
"http://cherbin.com",
|
82
|
+
"http://flerbin.com",
|
83
|
+
"http://berkin.com/berkin/cherbin.jpg"
|
84
|
+
])
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context 'images are incorrect size' do
|
89
|
+
it 'should populate link_data image_urls' do
|
90
|
+
expect(link_data.image_urls).to eq([])
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
context 'some images are correct size and some are not' do
|
95
|
+
it 'should populate link_data image_urls only with the correctly sized images' do
|
96
|
+
FastImage.should_receive(:size).with("http://berkin.com").and_return([50, 60])
|
97
|
+
FastImage.should_receive(:size).with("http://flerbin.com").and_return([60, 55])
|
98
|
+
FastImage.should_receive(:size).with("http://cherbin.com").and_return([160, 155])
|
99
|
+
FastImage.should_receive(:size).with("http://berkin.com/berkin/cherbin.jpg").and_return([160, 155])
|
100
|
+
|
101
|
+
expect(link_data.image_urls).to match_array([
|
102
|
+
"http://cherbin.com",
|
103
|
+
"http://berkin.com/berkin/cherbin.jpg"
|
104
|
+
])
|
105
|
+
end
|
106
|
+
end
|
76
107
|
end
|
77
108
|
end
|
78
|
-
end
|
109
|
+
end
|
metadata
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: link_oracle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ian Cooper
|
8
8
|
- Fito von Zastrow
|
9
9
|
- Kane Baccigalupi
|
10
10
|
- Sowjanya Mudunuri
|
11
|
+
- Rae Bonfanti
|
12
|
+
- Zoe Madden-Wood
|
11
13
|
autorequire:
|
12
14
|
bindir: bin
|
13
15
|
cert_chain: []
|
14
|
-
date: 2014-
|
16
|
+
date: 2014-07-10 00:00:00.000000000 Z
|
15
17
|
dependencies:
|
16
18
|
- !ruby/object:Gem::Dependency
|
17
19
|
name: nokogiri
|
@@ -41,6 +43,20 @@ dependencies:
|
|
41
43
|
- - '>='
|
42
44
|
- !ruby/object:Gem::Version
|
43
45
|
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: fastimage
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
type: :runtime
|
54
|
+
prerelease: false
|
55
|
+
version_requirements: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
44
60
|
- !ruby/object:Gem::Dependency
|
45
61
|
name: bundler
|
46
62
|
requirement: !ruby/object:Gem::Requirement
|
@@ -147,7 +163,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
147
163
|
version: '0'
|
148
164
|
requirements: []
|
149
165
|
rubyforge_project:
|
150
|
-
rubygems_version: 2.
|
166
|
+
rubygems_version: 2.2.2
|
151
167
|
signing_key:
|
152
168
|
specification_version: 4
|
153
169
|
summary: Scrapes pages for open graph, meta, and lastly, body preview data
|