ogo 0.0.2 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 41df5206b948578ceb2d31a306c73f84f35b837e
4
- data.tar.gz: f871b617057c6b54f11dd2aa255880d6a623db8c
3
+ metadata.gz: 3916d06ba08cf0c304c487d5754590b0bfdb64aa
4
+ data.tar.gz: 06cc4a8c9e9c38f84c86fe56acab9756901ea058
5
5
  SHA512:
6
- metadata.gz: ddeef19fba2db90c1f70729e9d99ed8d7c2f908eafb83b2b791139262a88d66d096cf8b855e695ad6608112f6652f27fc5a21a579e135b067caf0fdb784117e6
7
- data.tar.gz: 1b714a39fece5216852e31eadfbcf9822e50cd3233c98871015b55d0d99ceeed3587132bd6cb96129cef644a62374ff5102654730931eb938f16f12ef96b9fb3
6
+ metadata.gz: c13484e58cf8a94f6cf675f346116277e315bc42ba21b9e031d7abf2be99d48bfffa7a3ab5134bb71e87b134a17c855b62bb7a2cd1af048f29a09fa66eb61d5b
7
+ data.tar.gz: 866bf793b4475f784ca534119063e6ebe18452993ff1fdef32f08f48231c90d9c4bf4f622561512a37b3a83c5024226b92ca6015a184a9d0c1e7a0bb599976af
@@ -0,0 +1,76 @@
module Ogo
  # Lazily resolved facts about a single image: its URL, pixel size and format.
  #
  # Values handed to the constructor are used as given. Anything missing is
  # looked up through FastImage, but only when that constant is defined by the
  # host application; without it the size is unknown and the type is derived
  # from the file extension in the URL.
  class ImageInfo

    attr_accessor :url

    # @param opts [Hash] optional :url, :width, :height and :type values
    def initialize(opts={})
      @url    = opts[:url]
      @width  = opts[:width]
      @height = opts[:height]
      @type   = opts[:type]
    end

    # @return [Integer, nil] pixel width, or nil when it cannot be determined
    def width
      fetch_size[0]
    end

    # @return [Integer, nil] pixel height, or nil when it cannot be determined
    def height
      fetch_size[1]
    end

    # @return [String] image format name (for example "png"); may be empty
    def type
      fetch_type
    end

    # @return [String] "image/" followed by the value of #type
    def content_type
      "image/#{fetch_type}"
    end

    # Returns the known size, asking FastImage for it when either dimension is
    # still missing.
    #
    # @return [Array] [width, height]; an empty array when FastImage is not
    #   loaded; [nil, nil] when the lookup failed
    def fetch_size
      return [@width, @height] if @width && @height
      return [] unless defined?(FastImage)

      # fi_check answers nil on failure. Destructuring nil leaves both
      # dimensions nil, and returning the pair (rather than the raw nil) keeps
      # #width and #height from raising NoMethodError.
      @width, @height = fi_check(:size, url)
      [@width, @height]
    end

    # Forgets the cached size and looks it up again.
    def fetch_size!
      @width, @height = nil
      fetch_size
    end

    # Returns the cached type, resolving it on first use.
    #
    # @return [String] format reported by FastImage, or the URL's file
    #   extension when FastImage is not loaded
    def fetch_type
      @type ||=
        if defined?(FastImage)
          fi_check(:type, url).to_s
        else
          extension_of(url)
        end
    end

    # Forgets the cached type and resolves it again.
    def fetch_type!
      @type = nil
      fetch_type
    end

    private

    # File extension of the URL's path without the leading dot. Empty when the
    # URL is nil or the last path segment has no extension, so a dot in an
    # earlier segment is never mistaken for one.
    def extension_of(location)
      uri = Addressable::URI.parse(location)
      return '' unless uri

      File.extname(uri.normalize.path.to_s).sub(/\A\./, '')
    end

    # Calls FastImage.<method>(url, options). If that raises, the call is
    # repeated once with the normalized URL.
    #
    # @return [Object, nil] the FastImage result, or nil when both attempts fail
    def fi_check(method, url, options=nil)
      options ||= {raise_on_failure: true, timeout: 2.0}
      FastImage.send(method, url, options)
    rescue StandardError
      retry_with_normalized_url(method, url, options)
    end

    # Second attempt of fi_check. On success the normalized URL is stored in
    # @url as a String, so #url keeps returning the same type it was given.
    def retry_with_normalized_url(method, url, options)
      normalized = Addressable::URI.parse(url).normalize.to_s
      value = FastImage.send(method, normalized, options)
      @url = normalized
      value
    rescue StandardError => e
      # Reported on stderr: a library should not write diagnostics to stdout.
      warn "Image url error: url=\"#{url}\", error=\"#{e}\""
      nil
    end

  end
end
@@ -0,0 +1,60 @@
module Ogo
  # Raw HTML of a page together with the URL it came from, its character set
  # and (after #parse!) the parsed Nokogiri document.
  class PageSource

    # Maximum number of characters of src/doc shown by #inspect.
    INSPECT_LIMIT = 100

    attr_reader :url, :src, :charset, :doc

    # @param src [String] HTML source
    # @param options [Hash] optional :url and :charset
    def initialize(src, options={})
      @src = src
      @url = options[:url]
      @charset = options[:charset]
    end

    # Parses the source as HTML. When no charset was supplied it is first
    # guessed from the document's <meta> tags and remembered.
    #
    # @return the document built by Nokogiri::HTML
    def parse
      @charset ||= guess_encoding(Nokogiri.parse(src.scrub))
      Nokogiri::HTML(src, nil, charset)
    end

    # Parses the source and stores the document in #doc.
    #
    # @return [Ogo::PageSource] self
    def parse!
      @doc = parse
      self
    end

    # Short multi-line description; src and doc are cut to INSPECT_LIMIT chars.
    def inspect
      str = "<Ogo::PageSource:0x00#{'%x' % (object_id << 1)}\n"
      str << "url=\"#{url}\",\n"
      str << "charset=\"#{charset}\",\n"
      str << "src=\"#{shorten(src)}\",\n"
      str << "doc=\"#{shorten(doc)}\" >"
      str
    end

    def to_s
      inspect
    end

    private

    # Cuts value.to_s to at most `limit` characters, the omission marker
    # included. Plain-Ruby replacement for String#truncate, which only exists
    # when ActiveSupport is loaded.
    def shorten(value, limit=INSPECT_LIMIT, omission='...')
      text = value.to_s
      return text if text.length <= limit

      text[0, limit - omission.length] + omission
    end

    # Determines the page encoding from its <meta> tags.
    #
    # Order of preference: <meta charset="...">, then the charset= parameter
    # of a <meta http-equiv="Content-Type" content="...">, then 'UTF-8'.
    #
    # @param document [#xpath] document produced by Nokogiri
    # @return [String]
    def guess_encoding(document)
      declared = document.xpath('//meta/@charset').first
      declared_name = declared && declared.value.to_s.strip
      return declared_name unless declared_name.nil? || declared_name.empty?

      document.xpath('//meta').each do |meta|
        next unless content_tag?(meta)

        # Only accept an explicit charset= parameter; a bare media type such
        # as "text/html" is not an encoding name.
        name = meta.attribute('content').value.to_s[/charset=([^;\s"']+)/i, 1]
        return name if name
      end

      'UTF-8'
    end

    # True for <meta http-equiv="Content-Type" content="..."> (case-insensitive).
    def content_tag?(m)
      http_equiv = m.attribute('http-equiv')
      return false unless http_equiv && m.attribute('content')

      # casecmp returns an Integer, and every Integer is truthy in Ruby, so
      # the result has to be compared with 0 explicitly.
      http_equiv.value.to_s.casecmp('Content-Type') == 0
    end

  end
end
@@ -0,0 +1,106 @@
module Ogo
  module Parsers
    # Extracts title, description, type and images from plain HTML markup
    # (<title>, <meta name="description">, <img>, ...).
    #
    # The public readers accept a `fallback` argument that this class ignores;
    # it is part of the signature so subclasses can override the readers with
    # the same arity.
    class Base

      attr_reader :page, :url

      # @param parseable [String] either HTML source (recognised by a closing
      #   </html> tag) or something Ogo::Utils::RedirectFollower can resolve
      def initialize(parseable)
        @page =
          if parseable.include?('</html>')
            @url = ''
            Ogo::PageSource.new(parseable).parse!
          else
            resolved = Ogo::Utils::RedirectFollower.new(parseable).resolve
            @url = resolved.url
            Ogo::PageSource.new(resolved.body, charset: resolved.charset, url: resolved.url).parse!
          end
        @type = 'website'
      end

      # @return [String, nil] stripped text of the first <title> in <head>
      def title(fallback=false)
        title_tag = page.doc.xpath('//head//title').first
        title_tag && title_tag.text.to_s.strip
      end

      # @return [String] content of <meta name="description">, or the first
      #   paragraph longer than 20 characters, or '' when neither exists
      def description(fallback=false)
        meta = page.doc.xpath("//head//meta[@name='description']").first
        text = meta && meta.attribute("content").to_s.strip
        if text.nil? || text.empty?
          fetch_first_text
        else
          text
        end
      end

      # @return [Ogo::ImageInfo, nil] first entry of #all_images
      def image(fallback=false)
        all_images.first
      end

      # @return [String] 'website'
      def type(fallback=false)
        @type
      end

      # Every image referenced by the page, in priority order: itemprop image
      # and logo metas, og:image, twitter:image:src, link rel=image_src, then
      # <img> tags. Duplicates are dropped and the list is memoized.
      #
      # @return [Array<Ogo::ImageInfo>]
      def all_images
        @all_images ||=
          begin
            sources = (
              fetch_images("//head//meta[@itemprop='image']", "content") +
              fetch_images("//head//meta[@itemprop='logo']", "content") +
              fetch_images("//head//meta[@property='og:image']", "content") +
              fetch_images("//head//meta[@property='twitter:image:src']", "content") +
              fetch_images("//head//link[@rel='image_src']", "href") +
              fetch_images("//img", "src")
            ).flatten.compact.uniq
            host_uri = Addressable::URI.parse(url)
            sources.map { |src| Ogo::ImageInfo.new(url: fix_image_path(src, host_uri)) }
          end
      end

      # @return [Hash] :title, :description, :type and :image, where :image is
      #   nil or a hash with :url, :width, :height and :type
      def metadata(fallback=false)
        meta = {
          title: title,
          description: description,
          type: type,
          image: nil
        }
        # Resolve the image once: an overriding #image may build a new object
        # on each call, and every new object repeats its own size lookup.
        img = image
        if img
          meta[:image] = {
            url: img.url,
            width: img.width,
            height: img.height,
            type: img.type
          }
        end
        meta
      end

      private

      # Turns an image reference found in the page into an absolute URL.
      #
      # @param img [String] value of a src/href/content attribute
      # @param host_uri [Addressable::URI, nil] URL of the page itself
      # @return [String]
      def fix_image_path(img, host_uri)
        if img.start_with?('//')
          # Protocol-relative reference: reuse the page's own scheme so an
          # https page is not handed http image URLs. Fall back to http when
          # the page URL is unknown.
          scheme = host_uri && host_uri.scheme.to_s.downcase
          scheme = 'http' unless %w(http https).include?(scheme)
          return "#{scheme}:#{img}"
        end
        return img if host_uri.nil? || host_uri.host.nil?

        if Addressable::URI.parse(img).host.nil?
          host_uri.join(img).to_s
        else
          img
        end
      end

      # Text of the first <p> longer than 20 characters.
      #
      # @return [String] '' when no paragraph qualifies (the bare `each` would
      #   otherwise hand the whole node collection back to the caller)
      def fetch_first_text
        page.doc.xpath('//p').each do |paragraph|
          text = paragraph.text.to_s.strip
          return text if text.length > 20
        end
        ''
      end

      # Non-empty, de-duplicated values of `attr` on the nodes matching
      # `xpath_str`.
      def fetch_images(xpath_str, attr)
        page.doc.xpath(xpath_str).map do |tag|
          tag.attribute(attr).to_s.strip
        end.reject { |value| value.empty? }.uniq
      end

    end
  end
end
@@ -0,0 +1,90 @@
module Ogo
  module Parsers
    # Reads Open Graph (<meta property="og:...">) values from a page.
    #
    # Each reader returns the og: value when present. The inherited plain-HTML
    # value is used instead when the reader is called with `fallback = true`,
    # or when the og: value is missing and the parser was created with
    # `fallback = true`.
    class Opengraph < Ogo::Parsers::Base

      # @param parseable [String] HTML source or a URL, passed on to the
      #   inherited constructor
      # @param fallback [Boolean] use the inherited values whenever an og: tag
      #   is missing
      def initialize(parseable, fallback=false)
        @global_fallback = fallback
        super parseable
      end

      # @return [String, nil] og:title; '' when absent and no fallback applies
      def title(fallback=false)
        return super if fallback

        og_or_fallback('title') { super }
      end

      # @return [String] og:description; '' when absent and no fallback applies
      def description(fallback=false)
        return super if fallback

        og_or_fallback('description') { super }
      end

      # The og:image as an ImageInfo. The object is built once and reused, so
      # reading url, width, height and type through repeated #image calls (as
      # the inherited #metadata does) triggers a single size lookup instead of
      # one per call.
      #
      # @return [Ogo::ImageInfo, nil]
      def image(fallback=false)
        return super if fallback

        og_url = find_meta('image')
        return (@global_fallback && super) || nil if og_url.empty?

        @og_image ||= Ogo::ImageInfo.new(
          url: fix_image_path(og_url, Addressable::URI.parse(url))
        )
      end

      # @return [String] og:type; '' when absent and no fallback applies
      def type(fallback=false)
        return super if fallback

        og_or_fallback('type') { super }
      end

      # Same hash as the inherited #metadata. With `fallback = true` it also
      # carries a :fallback entry holding the plain-HTML values.
      #
      # @return [Hash]
      def metadata(fallback=false)
        meta = super
        return meta unless fallback

        meta[:fallback] = {
          title: title(true),
          description: description(true),
          type: type(true),
          image: nil
        }
        img = image(true)
        if img
          meta[:fallback][:image] = {
            url: img.url,
            width: img.width,
            height: img.height,
            type: img.type
          }
        end
        meta
      end

      private

      # Returns the og:<meta_type> value when it is non-empty. Otherwise, and
      # only when the parser-wide fallback is enabled, returns what the block
      # yields (the inherited reader); '' if that is unavailable too. The
      # block keeps the inherited lookup lazy.
      def og_or_fallback(meta_type)
        value = find_meta(meta_type)
        return value unless value.empty?

        (@global_fallback && yield) || ''
      end

      # Stripped content of the first <meta property="og:<meta_type>"> inside
      # <head>; '' when there is none.
      def find_meta(meta_type)
        tag = page.doc.xpath('//head//meta').find { |node|
          node.attribute('property').to_s == "og:#{meta_type}"
        }
        (tag && tag.attribute('content')).to_s.strip
      end

    end
  end
end
@@ -0,0 +1,6 @@
1
+ module Ogo
2
+ module Parsers
3
+ require_relative 'parsers/base'
4
+ require_relative 'parsers/opengraph'
5
+ end
6
+ end
data/lib/ogo/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Ogo
2
- VERSION = '0.0.2'
2
+ VERSION = '0.1.1'
3
3
  end
data/lib/ogo.rb CHANGED
@@ -2,8 +2,11 @@ require 'addressable/uri'
2
2
  require 'uri'
3
3
  require 'nokogiri'
4
4
 
5
- require_relative 'ogo/opengraph'
5
+ require_relative 'ogo/page_source'
6
+ require_relative 'ogo/image_info'
7
+ require_relative 'ogo/parsers'
6
8
  require_relative 'ogo/utils/redirect_follower'
9
+ require_relative 'ogo/version'
7
10
 
8
11
  module Ogo
9
12
 
data/ogo.gemspec CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
19
19
  s.add_dependency 'nokogiri', '>= 1.6'
20
20
  s.add_dependency 'addressable', '>= 2.4.0'
21
21
  s.add_development_dependency 'rspec', '>= 3.0'
22
+ s.add_development_dependency 'fastimage'
22
23
  s.add_development_dependency 'pry'
23
24
  s.add_development_dependency 'byebug'
24
25
  s.add_development_dependency 'rake'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ogo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - gazay
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-24 00:00:00.000000000 Z
11
+ date: 2016-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fastimage
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: pry
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -107,7 +121,11 @@ files:
107
121
  - README.md
108
122
  - Rakefile
109
123
  - lib/ogo.rb
110
- - lib/ogo/opengraph.rb
124
+ - lib/ogo/image_info.rb
125
+ - lib/ogo/page_source.rb
126
+ - lib/ogo/parsers.rb
127
+ - lib/ogo/parsers/base.rb
128
+ - lib/ogo/parsers/opengraph.rb
111
129
  - lib/ogo/utils/redirect_follower.rb
112
130
  - lib/ogo/version.rb
113
131
  - ogo.gemspec
data/lib/ogo/opengraph.rb DELETED
@@ -1,179 +0,0 @@
1
- module Ogo
2
- class Opengraph
3
-
4
- attr_accessor :src, :url, :type, :title, :description,
5
- :images, :metadata, :response, :body, :charset, :original_images,
6
- :error
7
-
8
- def initialize(src, options = {})
9
- @src = src
10
- @body = nil
11
- @images = []
12
- @metadata = {}
13
- @charset = 'utf-8'
14
- @error = nil
15
-
16
- @_fallback = options[:fallback] || true
17
- @_options = options
18
- end
19
-
20
- def parse
21
- parse_opengraph(@_options)
22
- load_fallback if @_fallback
23
- check_images_path
24
- self
25
- end
26
-
27
- def parse!
28
- parse
29
- error ? raise(error) : self
30
- end
31
-
32
- def inspect
33
- str = "<Ogo::Opengraph:0x00#{'%x' % (self.object_id << 1)}\nurl=\"#{url}\",\nmetadata=#{metadata},\n"
34
- str << "images=#{images},\ntype=\"#{type}\",\ntitle=\"#{title}\">"
35
- str
36
- end
37
-
38
- def to_s
39
- inspect
40
- end
41
-
42
- private
43
-
44
- def parse_opengraph(options = {})
45
- begin
46
- if src.include? '</html>'
47
- self.body = src
48
- else
49
- resolved = Ogo::Utils::RedirectFollower.new(src, options).resolve
50
- self.body = resolved.body
51
- self.charset = resolved.charset if resolved.charset
52
- end
53
- rescue => e
54
- self.title = self.url = src
55
- self.error = e
56
- return
57
- end
58
-
59
- if body
60
- attrs_list = %w(title url type description)
61
- doc = parse_html
62
- doc.css('meta').each do |m|
63
- if m.attribute('property') && m.attribute('property').to_s.match(/^og:(.+)$/i)
64
- m_content = m.attribute('content').to_s.strip
65
- metadata_name = m.attribute('property').to_s.gsub("og:", "")
66
- self.metadata = add_metadata(metadata, metadata_name, m_content)
67
- case metadata_name
68
- when *attrs_list
69
- self.instance_variable_set("@#{metadata_name}", m_content) unless m_content.empty?
70
- when "image"
71
- add_image(m_content)
72
- end
73
- end
74
- end
75
- end
76
- end
77
-
78
- def load_fallback
79
- if body
80
- doc = parse_html
81
-
82
- if title.to_s.empty? && doc.xpath("//head//title").size > 0
83
- self.title = doc.xpath("//head//title").first.text.to_s.strip
84
- end
85
-
86
- self.url = src if url.to_s.empty?
87
-
88
- if description.to_s.empty? && description_meta = doc.xpath("//head//meta[@name='description']").first
89
- self.description = description_meta.attribute("content").to_s.strip
90
- end
91
-
92
- if description.to_s.empty?
93
- self.description = fetch_first_text(doc)
94
- end
95
-
96
- fetch_images(doc, "//head//link[@rel='image_src']", "href") if images.empty?
97
- fetch_images(doc, "//img", "src") if images.empty?
98
- end
99
- end
100
-
101
- def check_images_path
102
- self.original_images = images.dup
103
- uri = Addressable::URI.parse(url)
104
- imgs = images.dup
105
- self.images = []
106
- imgs.each do |img|
107
- if Addressable::URI.parse(img).host.nil?
108
- full_path = uri.join(img).to_s
109
- add_image(full_path)
110
- else
111
- add_image(img)
112
- end
113
- end
114
- end
115
-
116
- def add_image(image_url)
117
- unless images.include?(image_url) || image_url.to_s.empty?
118
- self.images << image_url
119
- end
120
- end
121
-
122
- def fetch_images(doc, xpath_str, attr)
123
- doc.xpath(xpath_str).each do |link|
124
- add_image(link.attribute(attr).to_s.strip)
125
- end
126
- end
127
-
128
- def fetch_first_text(doc)
129
- doc.xpath('//p').each do |p|
130
- s = p.text.to_s.strip
131
- return s if s.length > 20
132
- end
133
- end
134
-
135
- def add_metadata(metadata_container, path, content)
136
- path_elements = path.split(':')
137
- if path_elements.size > 1
138
- current_element = path_elements.delete_at(0)
139
- path = path_elements.join(':')
140
- if metadata_container[current_element.to_sym]
141
- path_pointer = metadata_container[current_element.to_sym].last
142
- index_count = metadata_container[current_element.to_sym].size
143
- metadata_container[current_element.to_sym][index_count - 1] = add_metadata(path_pointer, path, content)
144
- metadata_container
145
- else
146
- metadata_container[current_element.to_sym] = []
147
- metadata_container[current_element.to_sym] << add_metadata({}, path, content)
148
- metadata_container
149
- end
150
- else
151
- metadata_container[path.to_sym] ||= []
152
- metadata_container[path.to_sym] << {'_value'.to_sym => content}
153
- metadata_container
154
- end
155
- end
156
-
157
- def parse_html
158
- unless charset
159
- doc = Nokogiri.parse(body.scrub)
160
- self.charset = guess_encoding(doc)
161
- end
162
- Nokogiri::HTML(body, nil, charset)
163
- end
164
-
165
- def guess_encoding(doc)
166
- _charset = doc.xpath('//meta/@charset').first
167
- return _charset.value.to_s if charset
168
-
169
- _charset = doc.xpath('//meta').each do |m|
170
- if m.attribute('http-equiv') && m.attribute('content') && m.attribute('http-equiv').value.casecmp('Content-Type')
171
- return m.attribute('content').value.split('charset=').last.strip
172
- end
173
- end
174
-
175
- 'UTF-8'
176
- end
177
-
178
- end
179
- end