link_oracle 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3be685ce4c88a7b4bb145386e83a7abe75182002
4
- data.tar.gz: 8f9f0067936d8d8b330c8d44d2bd302fa18ec10e
3
+ metadata.gz: 66b779546f8f2c837e044e40d161365baf2ca004
4
+ data.tar.gz: 96689ad4ed3ac24d666b098c3ea2d4d9c057eb99
5
5
  SHA512:
6
- metadata.gz: a4b7e2c94808c71a5140aa96b57046f58d6cc4b04be79acb27f7ba7a718034b81ccaa60755d7c0ff391feaca4d88968e349b89fa19620a5f688a395aa1bc4f9d
7
- data.tar.gz: ac2fafb2410fc9c2cb0b979f27b0f2f3b2d3e7a55ca24a53b8700b725039b0c4b1b2e8cc9bf4d1f5db7bdaa67d85133151ffadeda95f26d62de457a602ccbdbb
6
+ metadata.gz: 6649f8b62eba20a02792fea7ac4b7b9018fe73c5a2fb3f8dee7587664cf561b4f7e8b5a54e4fbfa02202e019f7e30a8e414c7d708a7bed751a4cd5a26b3b787c
7
+ data.tar.gz: f22445e5ea8b082f61ee9c15995d4da57fd05e722f16dfe673e0f114c8f1da6bd9f95412b90025e8154b5c4f8947b9290265842edee4bd0564712ec84e2b60d2
data/lib/link_oracle.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'curb'
2
2
  require 'nokogiri'
3
3
  require 'uri'
4
+ require 'fastimage'
4
5
 
5
6
  require 'link_oracle/extractor/base'
6
7
  require 'link_oracle/request'
@@ -9,7 +9,13 @@ module Utils
9
9
 
10
10
  def perform
11
11
  return unless image_url
12
- invalid_url? ? "#{scheme}://#{host}#{encoded_image_url}" : encoded_image_url
12
+ if host_missing?
13
+ "#{scheme}://#{host}#{encoded_image_url}"
14
+ elsif scheme_missing?
15
+ "http:#{encoded_image_url}"
16
+ else
17
+ encoded_image_url
18
+ end
13
19
  end
14
20
 
15
21
  def encoded_image_url
@@ -32,7 +38,11 @@ module Utils
32
38
  @parsed_image_url ||= URI.parse(encoded_image_url)
33
39
  end
34
40
 
35
- def invalid_url?
41
+ def scheme_missing?
42
+ parsed_image_url.scheme.to_s.empty?
43
+ end
44
+
45
+ def host_missing?
36
46
  !parsed_image_url.host
37
47
  end
38
48
  end
@@ -1,10 +1,11 @@
1
1
  class LinkOracle
2
2
  module Extractor
3
3
  class Base
4
- attr_reader :parsed_body, :link_data
4
+ attr_reader :parsed_body, :url, :link_data
5
5
 
6
- def initialize(parsed_body)
7
- @parsed_body = parsed_body
6
+ def initialize(parsed_url)
7
+ @parsed_body = parsed_url[:parsed_data]
8
+ @url = parsed_url[:url]
8
9
  @link_data = LinkData::Data.new
9
10
  end
10
11
 
@@ -1,11 +1,8 @@
1
1
  class LinkOracle
2
2
  module Extractor
3
- class Body
4
- attr_reader :parsed_body, :link_data
5
-
6
- def initialize(parsed_body)
7
- @parsed_body = parsed_body
8
- @link_data = LinkData::Data.new
3
+ class Body < Base
4
+ def type
5
+ :body
9
6
  end
10
7
 
11
8
  def perform
@@ -14,7 +11,6 @@ class LinkOracle
14
11
  image_urls: images,
15
12
  descriptions: descriptions
16
13
  })
17
-
18
14
  end
19
15
 
20
16
  def titles
@@ -24,9 +20,30 @@ class LinkOracle
24
20
  end
25
21
 
26
22
  def images
27
- @images ||= parsed_body.xpath(
28
- "//img[@src[contains(.,'://') and not(contains(.,'ads.') or contains(.,'ad.') or contains(.,'?'))]]"
29
- ).first(3).compact.map{ |node| node['src'] }
23
+ @images ||= valid_size_images
24
+ end
25
+
26
+ def parsed_images
27
+ @parsed_images ||= parsed_body.xpath(
28
+ "//img[@src[(contains(.,'://') or contains(., '/')) and not(contains(.,'ads.') or contains(.,'ad.') or contains(.,'?') or contains(.,'.gif'))]]"
29
+ ).map{ |node| node['src'] }
30
+ end
31
+
32
+ def formatted_images
33
+ parsed_images.map { |image_url| ::Utils::ImageUrlFormatter.new(url, image_url).perform }
34
+ end
35
+
36
+ def valid_size_images
37
+ formatted_images.select do |image|
38
+ size = image_size(image)
39
+ size[0] >= 100 && size[1] >= 100 if size
40
+ end
41
+ end
42
+
43
+ def image_size(image)
44
+ ::FastImage.size(image)
45
+ rescue ::URI::InvalidURIError
46
+ [0, 0]
30
47
  end
31
48
 
32
49
  def descriptions
@@ -34,4 +51,4 @@ class LinkOracle
34
51
  end
35
52
  end
36
53
  end
37
- end
54
+ end
@@ -1,10 +1,9 @@
1
1
  class LinkOracle
2
2
  class LinkData
3
- attr_reader :parsed_data, :url
3
+ attr_reader :parsed_url
4
4
 
5
5
  def initialize(parsed_url)
6
- @parsed_data = parsed_url[:parsed_data]
7
- @url = parsed_url[:url]
6
+ @parsed_url = parsed_url
8
7
  end
9
8
 
10
9
  #TODO: Need to write tests for these
@@ -17,19 +16,19 @@ class LinkOracle
17
16
  end
18
17
 
19
18
  def image_url
20
- Utils::ImageUrlFormatter.new(url, og.image_url || meta.image_url || body.image_url).perform
19
+ og.image_url || meta.image_url || body.image_url
21
20
  end
22
21
 
23
22
  def og
24
- @og ||= Extractor::OG.new(parsed_data).perform
23
+ @og ||= Extractor::OG.new(parsed_url).perform
25
24
  end
26
25
 
27
26
  def meta
28
- @meta ||= Extractor::Meta.new(parsed_data).perform
27
+ @meta ||= Extractor::Meta.new(parsed_url).perform
29
28
  end
30
29
 
31
30
  def body
32
- @body ||= Extractor::Body.new(parsed_data).perform
31
+ @body ||= Extractor::Body.new(parsed_url).perform
33
32
  end
34
33
  end
35
34
  end
@@ -12,6 +12,10 @@ class LinkOracle
12
12
  image_urls.first
13
13
  end
14
14
 
15
+ def format_image(image)
16
+ Utils::ImageUrlFormatter.new(url, image)
17
+ end
18
+
15
19
  def title
16
20
  titles.first
17
21
  end
@@ -1,3 +1,3 @@
1
1
  class LinkOracle
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
data/link_oracle.gemspec CHANGED
@@ -7,7 +7,7 @@ require 'link_oracle/version'
7
7
  Gem::Specification.new do |spec|
8
8
  spec.name = "link_oracle"
9
9
  spec.version = LinkOracle::VERSION
10
- spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri']
10
+ spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri', 'Rae Bonfanti', 'Zoe Madden-Wood']
11
11
  spec.email = ["developers@socialchorus.com"]
12
12
  spec.description = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
13
13
  spec.summary = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
21
21
 
22
22
  spec.add_dependency 'nokogiri'
23
23
  spec.add_dependency 'curb'
24
+ spec.add_dependency 'fastimage'
24
25
 
25
26
  spec.add_development_dependency "bundler", "~> 1.3"
26
27
  spec.add_development_dependency "rake"
@@ -2,19 +2,31 @@ require 'spec_helper'
2
2
 
3
3
  describe Utils::ImageUrlFormatter do
4
4
  let(:url) { "http://berkin.com/whatever/else/is/here" }
5
- let(:image_url) { "/some/stupid/path" }
6
5
  let(:formatted_url) { Utils::ImageUrlFormatter.new(url, image_url).perform }
7
6
 
8
- context 'scheme is http' do
9
- it 'should return the image as a full url using the host as domain' do
10
- formatted_url.should == 'http://berkin.com/some/stupid/path'
7
+ context 'the host is missing from the image url' do
8
+ let(:image_url) { "/some/stupid/path" }
9
+
10
+ context 'scheme is http' do
11
+ it 'should return the image as a full url using the host as domain' do
12
+ expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
13
+ end
14
+ end
15
+
16
+ context 'scheme is https' do
17
+ let(:url) { "https://berkin.com/whatever/else/is/here" }
18
+
19
+ it 'should return the image as a full url using the host as domain' do
20
+ expect(formatted_url).to eq('https://berkin.com/some/stupid/path')
21
+ end
11
22
  end
12
23
  end
13
24
 
14
- context 'scheme is https' do
15
- let(:url) { "https://berkin.com/whatever/else/is/here" }
16
- it 'should return the image as a full url using the host as domain' do
17
- formatted_url.should == 'https://berkin.com/some/stupid/path'
25
+ context 'but the host is present, but the scheme is missing' do
26
+ let(:image_url) { "//berkin.com/some/stupid/path" }
27
+
28
+ it 'should return the image as a full url using http as the protocol' do
29
+ expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
18
30
  end
19
31
  end
20
32
 
@@ -27,7 +39,7 @@ describe Utils::ImageUrlFormatter do
27
39
  end
28
40
 
29
41
  it 'returns nil' do
30
- formatted_url.should == nil
42
+ expect(formatted_url).to be_nil
31
43
  end
32
44
  end
33
45
  end
@@ -2,7 +2,7 @@ require 'spec_helper'
2
2
 
3
3
  describe LinkOracle::Extractor::Body do
4
4
  let(:parsed_body) { ::Nokogiri::HTML.parse(body) }
5
- let(:link_data) { LinkOracle::Extractor::Body.new(parsed_body).perform }
5
+ let(:link_data) { LinkOracle::Extractor::Body.new({ url: 'http://berkin.com', parsed_data: parsed_body}).perform }
6
6
 
7
7
  let(:body) {
8
8
  <<-HTML
@@ -34,6 +34,10 @@ describe LinkOracle::Extractor::Body do
34
34
  }
35
35
 
36
36
  describe 'perform' do
37
+ before do
38
+ FastImage.stub(:size).and_return([0, 0])
39
+ end
40
+
37
41
  context 'there is no suitable stuff in the body' do
38
42
  let(:body) {
39
43
  "<html>
@@ -58,14 +62,6 @@ describe LinkOracle::Extractor::Body do
58
62
  ]
59
63
  end
60
64
 
61
- it 'should populate link_data image_urls' do
62
- link_data.image_urls.should == [
63
- "http://berkin.com",
64
- "http://cherbin.com",
65
- "http://flerbin.com"
66
- ]
67
- end
68
-
69
65
  it 'should populate link_data descriptions' do
70
66
  link_data.descriptions.should == [
71
67
  "paragraph 1",
@@ -73,6 +69,41 @@ describe LinkOracle::Extractor::Body do
73
69
  "paragraph 3"
74
70
  ]
75
71
  end
72
+
73
+ context 'images are a correct size' do
74
+ before do
75
+ FastImage.stub(:size).and_return([100, 121])
76
+ end
77
+
78
+ it 'should populate link_data image_urls' do
79
+ expect(link_data.image_urls).to match_array([
80
+ "http://berkin.com",
81
+ "http://cherbin.com",
82
+ "http://flerbin.com",
83
+ "http://berkin.com/berkin/cherbin.jpg"
84
+ ])
85
+ end
86
+ end
87
+
88
+ context 'images are incorrect size' do
89
+ it 'should populate link_data image_urls' do
90
+ expect(link_data.image_urls).to eq([])
91
+ end
92
+ end
93
+
94
+ context 'some images are correct size and some are not' do
95
+ it 'should populate link_data image_urls only with the correctly sized images' do
96
+ FastImage.should_receive(:size).with("http://berkin.com").and_return([50, 60])
97
+ FastImage.should_receive(:size).with("http://flerbin.com").and_return([60, 55])
98
+ FastImage.should_receive(:size).with("http://cherbin.com").and_return([160, 155])
99
+ FastImage.should_receive(:size).with("http://berkin.com/berkin/cherbin.jpg").and_return([160, 155])
100
+
101
+ expect(link_data.image_urls).to match_array([
102
+ "http://cherbin.com",
103
+ "http://berkin.com/berkin/cherbin.jpg"
104
+ ])
105
+ end
106
+ end
76
107
  end
77
108
  end
78
- end
109
+ end
metadata CHANGED
@@ -1,17 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: link_oracle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Cooper
8
8
  - Fito von Zastrow
9
9
  - Kane Baccigalupi
10
10
  - Sowjanya Mudunuri
11
+ - Rae Bonfanti
12
+ - Zoe Madden-Wood
11
13
  autorequire:
12
14
  bindir: bin
13
15
  cert_chain: []
14
- date: 2014-06-26 00:00:00.000000000 Z
16
+ date: 2014-07-10 00:00:00.000000000 Z
15
17
  dependencies:
16
18
  - !ruby/object:Gem::Dependency
17
19
  name: nokogiri
@@ -41,6 +43,20 @@ dependencies:
41
43
  - - '>='
42
44
  - !ruby/object:Gem::Version
43
45
  version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: fastimage
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ type: :runtime
54
+ prerelease: false
55
+ version_requirements: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
44
60
  - !ruby/object:Gem::Dependency
45
61
  name: bundler
46
62
  requirement: !ruby/object:Gem::Requirement
@@ -147,7 +163,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
147
163
  version: '0'
148
164
  requirements: []
149
165
  rubyforge_project:
150
- rubygems_version: 2.3.0
166
+ rubygems_version: 2.2.2
151
167
  signing_key:
152
168
  specification_version: 4
153
169
  summary: Scrapes pages for open graph, meta, and lastly, body preview data