link_oracle 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/link_oracle.rb +1 -0
- data/lib/link_oracle/Utils/image_url_formatter.rb +12 -2
- data/lib/link_oracle/extractor/base.rb +4 -3
- data/lib/link_oracle/extractor/body.rb +28 -11
- data/lib/link_oracle/link_data.rb +6 -7
- data/lib/link_oracle/link_data/data.rb +4 -0
- data/lib/link_oracle/version.rb +1 -1
- data/link_oracle.gemspec +2 -1
- data/spec/link_oracle/Utils/image_url_formatter_spec.rb +21 -9
- data/spec/link_oracle/extractor/body_spec.rb +41 -10
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66b779546f8f2c837e044e40d161365baf2ca004
|
4
|
+
data.tar.gz: 96689ad4ed3ac24d666b098c3ea2d4d9c057eb99
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6649f8b62eba20a02792fea7ac4b7b9018fe73c5a2fb3f8dee7587664cf561b4f7e8b5a54e4fbfa02202e019f7e30a8e414c7d708a7bed751a4cd5a26b3b787c
|
7
|
+
data.tar.gz: f22445e5ea8b082f61ee9c15995d4da57fd05e722f16dfe673e0f114c8f1da6bd9f95412b90025e8154b5c4f8947b9290265842edee4bd0564712ec84e2b60d2
|
data/lib/link_oracle.rb
CHANGED
data/lib/link_oracle/Utils/image_url_formatter.rb
CHANGED
@@ -9,7 +9,13 @@ module Utils
|
|
9
9
|
|
10
10
|
def perform
|
11
11
|
return unless image_url
|
12
|
-
|
12
|
+
if host_missing?
|
13
|
+
"#{scheme}://#{host}#{encoded_image_url}"
|
14
|
+
elsif scheme_missing?
|
15
|
+
"http:#{encoded_image_url}"
|
16
|
+
else
|
17
|
+
encoded_image_url
|
18
|
+
end
|
13
19
|
end
|
14
20
|
|
15
21
|
def encoded_image_url
|
@@ -32,7 +38,11 @@ module Utils
|
|
32
38
|
@parsed_image_url ||= URI.parse(encoded_image_url)
|
33
39
|
end
|
34
40
|
|
35
|
-
def
|
41
|
+
def scheme_missing?
|
42
|
+
parsed_image_url.scheme.to_s.empty?
|
43
|
+
end
|
44
|
+
|
45
|
+
def host_missing?
|
36
46
|
!parsed_image_url.host
|
37
47
|
end
|
38
48
|
end
|
data/lib/link_oracle/extractor/base.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
class LinkOracle
|
2
2
|
module Extractor
|
3
3
|
class Base
|
4
|
-
attr_reader :parsed_body, :link_data
|
4
|
+
attr_reader :parsed_body, :url, :link_data
|
5
5
|
|
6
|
-
def initialize(
|
7
|
-
@parsed_body =
|
6
|
+
def initialize(parsed_url)
|
7
|
+
@parsed_body = parsed_url[:parsed_data]
|
8
|
+
@url = parsed_url[:url]
|
8
9
|
@link_data = LinkData::Data.new
|
9
10
|
end
|
10
11
|
|
data/lib/link_oracle/extractor/body.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
1
|
class LinkOracle
|
2
2
|
module Extractor
|
3
|
-
class Body
|
4
|
-
|
5
|
-
|
6
|
-
def initialize(parsed_body)
|
7
|
-
@parsed_body = parsed_body
|
8
|
-
@link_data = LinkData::Data.new
|
3
|
+
class Body < Base
|
4
|
+
def type
|
5
|
+
:body
|
9
6
|
end
|
10
7
|
|
11
8
|
def perform
|
@@ -14,7 +11,6 @@ class LinkOracle
|
|
14
11
|
image_urls: images,
|
15
12
|
descriptions: descriptions
|
16
13
|
})
|
17
|
-
|
18
14
|
end
|
19
15
|
|
20
16
|
def titles
|
@@ -24,9 +20,30 @@ class LinkOracle
|
|
24
20
|
end
|
25
21
|
|
26
22
|
def images
|
27
|
-
@images ||=
|
28
|
-
|
29
|
-
|
23
|
+
@images ||= valid_size_images
|
24
|
+
end
|
25
|
+
|
26
|
+
def parsed_images
|
27
|
+
@parsed_images ||= parsed_body.xpath(
|
28
|
+
"//img[@src[(contains(.,'://') or contains(., '/')) and not(contains(.,'ads.') or contains(.,'ad.') or contains(.,'?') or contains(.,'.gif'))]]"
|
29
|
+
).map{ |node| node['src'] }
|
30
|
+
end
|
31
|
+
|
32
|
+
def formatted_images
|
33
|
+
parsed_images.map { |image_url| ::Utils::ImageUrlFormatter.new(url, image_url).perform }
|
34
|
+
end
|
35
|
+
|
36
|
+
def valid_size_images
|
37
|
+
formatted_images.select do |image|
|
38
|
+
size = image_size(image)
|
39
|
+
size[0] >= 100 && size[1] >= 100 if size
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def image_size(image)
|
44
|
+
::FastImage.size(image)
|
45
|
+
rescue ::URI::InvalidURIError
|
46
|
+
[0, 0]
|
30
47
|
end
|
31
48
|
|
32
49
|
def descriptions
|
@@ -34,4 +51,4 @@ class LinkOracle
|
|
34
51
|
end
|
35
52
|
end
|
36
53
|
end
|
37
|
-
end
|
54
|
+
end
|
data/lib/link_oracle/link_data.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
class LinkOracle
|
2
2
|
class LinkData
|
3
|
-
attr_reader :
|
3
|
+
attr_reader :parsed_url
|
4
4
|
|
5
5
|
def initialize(parsed_url)
|
6
|
-
@
|
7
|
-
@url = parsed_url[:url]
|
6
|
+
@parsed_url = parsed_url
|
8
7
|
end
|
9
8
|
|
10
9
|
#TODO: Need to write tests for these
|
@@ -17,19 +16,19 @@ class LinkOracle
|
|
17
16
|
end
|
18
17
|
|
19
18
|
def image_url
|
20
|
-
|
19
|
+
og.image_url || meta.image_url || body.image_url
|
21
20
|
end
|
22
21
|
|
23
22
|
def og
|
24
|
-
@og ||= Extractor::OG.new(
|
23
|
+
@og ||= Extractor::OG.new(parsed_url).perform
|
25
24
|
end
|
26
25
|
|
27
26
|
def meta
|
28
|
-
@meta ||= Extractor::Meta.new(
|
27
|
+
@meta ||= Extractor::Meta.new(parsed_url).perform
|
29
28
|
end
|
30
29
|
|
31
30
|
def body
|
32
|
-
@body ||= Extractor::Body.new(
|
31
|
+
@body ||= Extractor::Body.new(parsed_url).perform
|
33
32
|
end
|
34
33
|
end
|
35
34
|
end
|
data/lib/link_oracle/version.rb
CHANGED
data/link_oracle.gemspec
CHANGED
@@ -7,7 +7,7 @@ require 'link_oracle/version'
|
|
7
7
|
Gem::Specification.new do |spec|
|
8
8
|
spec.name = "link_oracle"
|
9
9
|
spec.version = LinkOracle::VERSION
|
10
|
-
spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri']
|
10
|
+
spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri', 'Rae Bonfanti', 'Zoe Madden-Wood']
|
11
11
|
spec.email = ["developers@socialchorus.com"]
|
12
12
|
spec.description = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
|
13
13
|
spec.summary = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
|
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_dependency 'nokogiri'
|
23
23
|
spec.add_dependency 'curb'
|
24
|
+
spec.add_dependency 'fastimage'
|
24
25
|
|
25
26
|
spec.add_development_dependency "bundler", "~> 1.3"
|
26
27
|
spec.add_development_dependency "rake"
|
data/spec/link_oracle/Utils/image_url_formatter_spec.rb
CHANGED
@@ -2,19 +2,31 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Utils::ImageUrlFormatter do
|
4
4
|
let(:url) { "http://berkin.com/whatever/else/is/here" }
|
5
|
-
let(:image_url) { "/some/stupid/path" }
|
6
5
|
let(:formatted_url) { Utils::ImageUrlFormatter.new(url, image_url).perform }
|
7
6
|
|
8
|
-
context '
|
9
|
-
|
10
|
-
|
7
|
+
context 'the host is missing from the image url' do
|
8
|
+
let(:image_url) { "/some/stupid/path" }
|
9
|
+
|
10
|
+
context 'scheme is http' do
|
11
|
+
it 'should return the image as a full url using the host as domain' do
|
12
|
+
expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
context 'scheme is https' do
|
17
|
+
let(:url) { "https://berkin.com/whatever/else/is/here" }
|
18
|
+
|
19
|
+
it 'should return the image as a full url using the host as domain' do
|
20
|
+
expect(formatted_url).to eq('https://berkin.com/some/stupid/path')
|
21
|
+
end
|
11
22
|
end
|
12
23
|
end
|
13
24
|
|
14
|
-
context 'scheme is
|
15
|
-
let(:
|
16
|
-
|
17
|
-
|
25
|
+
context 'but the host is present, but the scheme is missing' do
|
26
|
+
let(:image_url) { "//berkin.com/some/stupid/path" }
|
27
|
+
|
28
|
+
it 'should return the image as a full url using http as the protocol' do
|
29
|
+
expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
|
18
30
|
end
|
19
31
|
end
|
20
32
|
|
@@ -27,7 +39,7 @@ describe Utils::ImageUrlFormatter do
|
|
27
39
|
end
|
28
40
|
|
29
41
|
it 'returns nil' do
|
30
|
-
formatted_url.
|
42
|
+
expect(formatted_url).to be_nil
|
31
43
|
end
|
32
44
|
end
|
33
45
|
end
|
data/spec/link_oracle/extractor/body_spec.rb
CHANGED
@@ -2,7 +2,7 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe LinkOracle::Extractor::Body do
|
4
4
|
let(:parsed_body) { ::Nokogiri::HTML.parse(body) }
|
5
|
-
let(:link_data) { LinkOracle::Extractor::Body.new(parsed_body).perform }
|
5
|
+
let(:link_data) { LinkOracle::Extractor::Body.new({ url: 'http://berkin.com', parsed_data: parsed_body}).perform }
|
6
6
|
|
7
7
|
let(:body) {
|
8
8
|
<<-HTML
|
@@ -34,6 +34,10 @@ describe LinkOracle::Extractor::Body do
|
|
34
34
|
}
|
35
35
|
|
36
36
|
describe 'perform' do
|
37
|
+
before do
|
38
|
+
FastImage.stub(:size).and_return([0, 0])
|
39
|
+
end
|
40
|
+
|
37
41
|
context 'there is no suitable stuff in the body' do
|
38
42
|
let(:body) {
|
39
43
|
"<html>
|
@@ -58,14 +62,6 @@ describe LinkOracle::Extractor::Body do
|
|
58
62
|
]
|
59
63
|
end
|
60
64
|
|
61
|
-
it 'should populate link_data image_urls' do
|
62
|
-
link_data.image_urls.should == [
|
63
|
-
"http://berkin.com",
|
64
|
-
"http://cherbin.com",
|
65
|
-
"http://flerbin.com"
|
66
|
-
]
|
67
|
-
end
|
68
|
-
|
69
65
|
it 'should populate link_data descriptions' do
|
70
66
|
link_data.descriptions.should == [
|
71
67
|
"paragraph 1",
|
@@ -73,6 +69,41 @@ describe LinkOracle::Extractor::Body do
|
|
73
69
|
"paragraph 3"
|
74
70
|
]
|
75
71
|
end
|
72
|
+
|
73
|
+
context 'images are a correct size' do
|
74
|
+
before do
|
75
|
+
FastImage.stub(:size).and_return([100, 121])
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'should populate link_data image_urls' do
|
79
|
+
expect(link_data.image_urls).to match_array([
|
80
|
+
"http://berkin.com",
|
81
|
+
"http://cherbin.com",
|
82
|
+
"http://flerbin.com",
|
83
|
+
"http://berkin.com/berkin/cherbin.jpg"
|
84
|
+
])
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context 'images are incorrect size' do
|
89
|
+
it 'should populate link_data image_urls' do
|
90
|
+
expect(link_data.image_urls).to eq([])
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
context 'some images are correct size and some are not' do
|
95
|
+
it 'should populate link_data image_urls only with the correctly sized images' do
|
96
|
+
FastImage.should_receive(:size).with("http://berkin.com").and_return([50, 60])
|
97
|
+
FastImage.should_receive(:size).with("http://flerbin.com").and_return([60, 55])
|
98
|
+
FastImage.should_receive(:size).with("http://cherbin.com").and_return([160, 155])
|
99
|
+
FastImage.should_receive(:size).with("http://berkin.com/berkin/cherbin.jpg").and_return([160, 155])
|
100
|
+
|
101
|
+
expect(link_data.image_urls).to match_array([
|
102
|
+
"http://cherbin.com",
|
103
|
+
"http://berkin.com/berkin/cherbin.jpg"
|
104
|
+
])
|
105
|
+
end
|
106
|
+
end
|
76
107
|
end
|
77
108
|
end
|
78
|
-
end
|
109
|
+
end
|
metadata
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: link_oracle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ian Cooper
|
8
8
|
- Fito von Zastrow
|
9
9
|
- Kane Baccigalupi
|
10
10
|
- Sowjanya Mudunuri
|
11
|
+
- Rae Bonfanti
|
12
|
+
- Zoe Madden-Wood
|
11
13
|
autorequire:
|
12
14
|
bindir: bin
|
13
15
|
cert_chain: []
|
14
|
-
date: 2014-
|
16
|
+
date: 2014-07-10 00:00:00.000000000 Z
|
15
17
|
dependencies:
|
16
18
|
- !ruby/object:Gem::Dependency
|
17
19
|
name: nokogiri
|
@@ -41,6 +43,20 @@ dependencies:
|
|
41
43
|
- - '>='
|
42
44
|
- !ruby/object:Gem::Version
|
43
45
|
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: fastimage
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
type: :runtime
|
54
|
+
prerelease: false
|
55
|
+
version_requirements: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
44
60
|
- !ruby/object:Gem::Dependency
|
45
61
|
name: bundler
|
46
62
|
requirement: !ruby/object:Gem::Requirement
|
@@ -147,7 +163,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
147
163
|
version: '0'
|
148
164
|
requirements: []
|
149
165
|
rubyforge_project:
|
150
|
-
rubygems_version: 2.
|
166
|
+
rubygems_version: 2.2.2
|
151
167
|
signing_key:
|
152
168
|
specification_version: 4
|
153
169
|
summary: Scrapes pages for open graph, meta, and lastly, body preview data
|