link_oracle 0.1.2 → 0.1.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3be685ce4c88a7b4bb145386e83a7abe75182002
4
- data.tar.gz: 8f9f0067936d8d8b330c8d44d2bd302fa18ec10e
3
+ metadata.gz: 66b779546f8f2c837e044e40d161365baf2ca004
4
+ data.tar.gz: 96689ad4ed3ac24d666b098c3ea2d4d9c057eb99
5
5
  SHA512:
6
- metadata.gz: a4b7e2c94808c71a5140aa96b57046f58d6cc4b04be79acb27f7ba7a718034b81ccaa60755d7c0ff391feaca4d88968e349b89fa19620a5f688a395aa1bc4f9d
7
- data.tar.gz: ac2fafb2410fc9c2cb0b979f27b0f2f3b2d3e7a55ca24a53b8700b725039b0c4b1b2e8cc9bf4d1f5db7bdaa67d85133151ffadeda95f26d62de457a602ccbdbb
6
+ metadata.gz: 6649f8b62eba20a02792fea7ac4b7b9018fe73c5a2fb3f8dee7587664cf561b4f7e8b5a54e4fbfa02202e019f7e30a8e414c7d708a7bed751a4cd5a26b3b787c
7
+ data.tar.gz: f22445e5ea8b082f61ee9c15995d4da57fd05e722f16dfe673e0f114c8f1da6bd9f95412b90025e8154b5c4f8947b9290265842edee4bd0564712ec84e2b60d2
data/lib/link_oracle.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'curb'
2
2
  require 'nokogiri'
3
3
  require 'uri'
4
+ require 'fastimage'
4
5
 
5
6
  require 'link_oracle/extractor/base'
6
7
  require 'link_oracle/request'
@@ -9,7 +9,13 @@ module Utils
9
9
 
10
10
  def perform
11
11
  return unless image_url
12
- invalid_url? ? "#{scheme}://#{host}#{encoded_image_url}" : encoded_image_url
12
+ if host_missing?
13
+ "#{scheme}://#{host}#{encoded_image_url}"
14
+ elsif scheme_missing?
15
+ "http:#{encoded_image_url}"
16
+ else
17
+ encoded_image_url
18
+ end
13
19
  end
14
20
 
15
21
  def encoded_image_url
@@ -32,7 +38,11 @@ module Utils
32
38
  @parsed_image_url ||= URI.parse(encoded_image_url)
33
39
  end
34
40
 
35
- def invalid_url?
41
+ def scheme_missing?
42
+ parsed_image_url.scheme.to_s.empty?
43
+ end
44
+
45
+ def host_missing?
36
46
  !parsed_image_url.host
37
47
  end
38
48
  end
@@ -1,10 +1,11 @@
1
1
  class LinkOracle
2
2
  module Extractor
3
3
  class Base
4
- attr_reader :parsed_body, :link_data
4
+ attr_reader :parsed_body, :url, :link_data
5
5
 
6
- def initialize(parsed_body)
7
- @parsed_body = parsed_body
6
+ def initialize(parsed_url)
7
+ @parsed_body = parsed_url[:parsed_data]
8
+ @url = parsed_url[:url]
8
9
  @link_data = LinkData::Data.new
9
10
  end
10
11
 
@@ -1,11 +1,8 @@
1
1
  class LinkOracle
2
2
  module Extractor
3
- class Body
4
- attr_reader :parsed_body, :link_data
5
-
6
- def initialize(parsed_body)
7
- @parsed_body = parsed_body
8
- @link_data = LinkData::Data.new
3
+ class Body < Base
4
+ def type
5
+ :body
9
6
  end
10
7
 
11
8
  def perform
@@ -14,7 +11,6 @@ class LinkOracle
14
11
  image_urls: images,
15
12
  descriptions: descriptions
16
13
  })
17
-
18
14
  end
19
15
 
20
16
  def titles
@@ -24,9 +20,30 @@ class LinkOracle
24
20
  end
25
21
 
26
22
  def images
27
- @images ||= parsed_body.xpath(
28
- "//img[@src[contains(.,'://') and not(contains(.,'ads.') or contains(.,'ad.') or contains(.,'?'))]]"
29
- ).first(3).compact.map{ |node| node['src'] }
23
+ @images ||= valid_size_images
24
+ end
25
+
26
+ def parsed_images
27
+ @parsed_images ||= parsed_body.xpath(
28
+ "//img[@src[(contains(.,'://') or contains(., '/')) and not(contains(.,'ads.') or contains(.,'ad.') or contains(.,'?') or contains(.,'.gif'))]]"
29
+ ).map{ |node| node['src'] }
30
+ end
31
+
32
+ def formatted_images
33
+ parsed_images.map { |image_url| ::Utils::ImageUrlFormatter.new(url, image_url).perform }
34
+ end
35
+
36
+ def valid_size_images
37
+ formatted_images.select do |image|
38
+ size = image_size(image)
39
+ size[0] >= 100 && size[1] >= 100 if size
40
+ end
41
+ end
42
+
43
+ def image_size(image)
44
+ ::FastImage.size(image)
45
+ rescue ::URI::InvalidURIError
46
+ [0, 0]
30
47
  end
31
48
 
32
49
  def descriptions
@@ -34,4 +51,4 @@ class LinkOracle
34
51
  end
35
52
  end
36
53
  end
37
- end
54
+ end
@@ -1,10 +1,9 @@
1
1
  class LinkOracle
2
2
  class LinkData
3
- attr_reader :parsed_data, :url
3
+ attr_reader :parsed_url
4
4
 
5
5
  def initialize(parsed_url)
6
- @parsed_data = parsed_url[:parsed_data]
7
- @url = parsed_url[:url]
6
+ @parsed_url = parsed_url
8
7
  end
9
8
 
10
9
  #TODO: Need to write tests for these
@@ -17,19 +16,19 @@ class LinkOracle
17
16
  end
18
17
 
19
18
  def image_url
20
- Utils::ImageUrlFormatter.new(url, og.image_url || meta.image_url || body.image_url).perform
19
+ og.image_url || meta.image_url || body.image_url
21
20
  end
22
21
 
23
22
  def og
24
- @og ||= Extractor::OG.new(parsed_data).perform
23
+ @og ||= Extractor::OG.new(parsed_url).perform
25
24
  end
26
25
 
27
26
  def meta
28
- @meta ||= Extractor::Meta.new(parsed_data).perform
27
+ @meta ||= Extractor::Meta.new(parsed_url).perform
29
28
  end
30
29
 
31
30
  def body
32
- @body ||= Extractor::Body.new(parsed_data).perform
31
+ @body ||= Extractor::Body.new(parsed_url).perform
33
32
  end
34
33
  end
35
34
  end
@@ -12,6 +12,10 @@ class LinkOracle
12
12
  image_urls.first
13
13
  end
14
14
 
15
+ def format_image(image)
16
+ Utils::ImageUrlFormatter.new(url, image)
17
+ end
18
+
15
19
  def title
16
20
  titles.first
17
21
  end
@@ -1,3 +1,3 @@
1
1
  class LinkOracle
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
data/link_oracle.gemspec CHANGED
@@ -7,7 +7,7 @@ require 'link_oracle/version'
7
7
  Gem::Specification.new do |spec|
8
8
  spec.name = "link_oracle"
9
9
  spec.version = LinkOracle::VERSION
10
- spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri']
10
+ spec.authors = ["Ian Cooper", 'Fito von Zastrow', 'Kane Baccigalupi', 'Sowjanya Mudunuri', 'Rae Bonfanti', 'Zoe Madden-Wood']
11
11
  spec.email = ["developers@socialchorus.com"]
12
12
  spec.description = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
13
13
  spec.summary = %q{Scrapes pages for open graph, meta, and lastly, body preview data}
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
21
21
 
22
22
  spec.add_dependency 'nokogiri'
23
23
  spec.add_dependency 'curb'
24
+ spec.add_dependency 'fastimage'
24
25
 
25
26
  spec.add_development_dependency "bundler", "~> 1.3"
26
27
  spec.add_development_dependency "rake"
@@ -2,19 +2,31 @@ require 'spec_helper'
2
2
 
3
3
  describe Utils::ImageUrlFormatter do
4
4
  let(:url) { "http://berkin.com/whatever/else/is/here" }
5
- let(:image_url) { "/some/stupid/path" }
6
5
  let(:formatted_url) { Utils::ImageUrlFormatter.new(url, image_url).perform }
7
6
 
8
- context 'scheme is http' do
9
- it 'should return the image as a full url using the host as domain' do
10
- formatted_url.should == 'http://berkin.com/some/stupid/path'
7
+ context 'the host is missing from the image url' do
8
+ let(:image_url) { "/some/stupid/path" }
9
+
10
+ context 'scheme is http' do
11
+ it 'should return the image as a full url using the host as domain' do
12
+ expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
13
+ end
14
+ end
15
+
16
+ context 'scheme is https' do
17
+ let(:url) { "https://berkin.com/whatever/else/is/here" }
18
+
19
+ it 'should return the image as a full url using the host as domain' do
20
+ expect(formatted_url).to eq('https://berkin.com/some/stupid/path')
21
+ end
11
22
  end
12
23
  end
13
24
 
14
- context 'scheme is https' do
15
- let(:url) { "https://berkin.com/whatever/else/is/here" }
16
- it 'should return the image as a full url using the host as domain' do
17
- formatted_url.should == 'https://berkin.com/some/stupid/path'
25
+ context 'but the host is present, but the scheme is missing' do
26
+ let(:image_url) { "//berkin.com/some/stupid/path" }
27
+
28
+ it 'should return the image as a full url using http as the protocol' do
29
+ expect(formatted_url).to eq('http://berkin.com/some/stupid/path')
18
30
  end
19
31
  end
20
32
 
@@ -27,7 +39,7 @@ describe Utils::ImageUrlFormatter do
27
39
  end
28
40
 
29
41
  it 'returns nil' do
30
- formatted_url.should == nil
42
+ expect(formatted_url).to be_nil
31
43
  end
32
44
  end
33
45
  end
@@ -2,7 +2,7 @@ require 'spec_helper'
2
2
 
3
3
  describe LinkOracle::Extractor::Body do
4
4
  let(:parsed_body) { ::Nokogiri::HTML.parse(body) }
5
- let(:link_data) { LinkOracle::Extractor::Body.new(parsed_body).perform }
5
+ let(:link_data) { LinkOracle::Extractor::Body.new({ url: 'http://berkin.com', parsed_data: parsed_body}).perform }
6
6
 
7
7
  let(:body) {
8
8
  <<-HTML
@@ -34,6 +34,10 @@ describe LinkOracle::Extractor::Body do
34
34
  }
35
35
 
36
36
  describe 'perform' do
37
+ before do
38
+ FastImage.stub(:size).and_return([0, 0])
39
+ end
40
+
37
41
  context 'there is no suitable stuff in the body' do
38
42
  let(:body) {
39
43
  "<html>
@@ -58,14 +62,6 @@ describe LinkOracle::Extractor::Body do
58
62
  ]
59
63
  end
60
64
 
61
- it 'should populate link_data image_urls' do
62
- link_data.image_urls.should == [
63
- "http://berkin.com",
64
- "http://cherbin.com",
65
- "http://flerbin.com"
66
- ]
67
- end
68
-
69
65
  it 'should populate link_data descriptions' do
70
66
  link_data.descriptions.should == [
71
67
  "paragraph 1",
@@ -73,6 +69,41 @@ describe LinkOracle::Extractor::Body do
73
69
  "paragraph 3"
74
70
  ]
75
71
  end
72
+
73
+ context 'images are a correct size' do
74
+ before do
75
+ FastImage.stub(:size).and_return([100, 121])
76
+ end
77
+
78
+ it 'should populate link_data image_urls' do
79
+ expect(link_data.image_urls).to match_array([
80
+ "http://berkin.com",
81
+ "http://cherbin.com",
82
+ "http://flerbin.com",
83
+ "http://berkin.com/berkin/cherbin.jpg"
84
+ ])
85
+ end
86
+ end
87
+
88
+ context 'images are incorrect size' do
89
+ it 'should populate link_data image_urls' do
90
+ expect(link_data.image_urls).to eq([])
91
+ end
92
+ end
93
+
94
+ context 'some images are correct size and some are not' do
95
+ it 'should populate link_data image_urls only with the correctly sized images' do
96
+ FastImage.should_receive(:size).with("http://berkin.com").and_return([50, 60])
97
+ FastImage.should_receive(:size).with("http://flerbin.com").and_return([60, 55])
98
+ FastImage.should_receive(:size).with("http://cherbin.com").and_return([160, 155])
99
+ FastImage.should_receive(:size).with("http://berkin.com/berkin/cherbin.jpg").and_return([160, 155])
100
+
101
+ expect(link_data.image_urls).to match_array([
102
+ "http://cherbin.com",
103
+ "http://berkin.com/berkin/cherbin.jpg"
104
+ ])
105
+ end
106
+ end
76
107
  end
77
108
  end
78
- end
109
+ end
metadata CHANGED
@@ -1,17 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: link_oracle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Cooper
8
8
  - Fito von Zastrow
9
9
  - Kane Baccigalupi
10
10
  - Sowjanya Mudunuri
11
+ - Rae Bonfanti
12
+ - Zoe Madden-Wood
11
13
  autorequire:
12
14
  bindir: bin
13
15
  cert_chain: []
14
- date: 2014-06-26 00:00:00.000000000 Z
16
+ date: 2014-07-10 00:00:00.000000000 Z
15
17
  dependencies:
16
18
  - !ruby/object:Gem::Dependency
17
19
  name: nokogiri
@@ -41,6 +43,20 @@ dependencies:
41
43
  - - '>='
42
44
  - !ruby/object:Gem::Version
43
45
  version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: fastimage
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ type: :runtime
54
+ prerelease: false
55
+ version_requirements: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
44
60
  - !ruby/object:Gem::Dependency
45
61
  name: bundler
46
62
  requirement: !ruby/object:Gem::Requirement
@@ -147,7 +163,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
147
163
  version: '0'
148
164
  requirements: []
149
165
  rubyforge_project:
150
- rubygems_version: 2.3.0
166
+ rubygems_version: 2.2.2
151
167
  signing_key:
152
168
  specification_version: 4
153
169
  summary: Scrapes pages for open graph, meta, and lastly, body preview data