ogo 0.0.2 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 41df5206b948578ceb2d31a306c73f84f35b837e
4
- data.tar.gz: f871b617057c6b54f11dd2aa255880d6a623db8c
3
+ metadata.gz: 3916d06ba08cf0c304c487d5754590b0bfdb64aa
4
+ data.tar.gz: 06cc4a8c9e9c38f84c86fe56acab9756901ea058
5
5
  SHA512:
6
- metadata.gz: ddeef19fba2db90c1f70729e9d99ed8d7c2f908eafb83b2b791139262a88d66d096cf8b855e695ad6608112f6652f27fc5a21a579e135b067caf0fdb784117e6
7
- data.tar.gz: 1b714a39fece5216852e31eadfbcf9822e50cd3233c98871015b55d0d99ceeed3587132bd6cb96129cef644a62374ff5102654730931eb938f16f12ef96b9fb3
6
+ metadata.gz: c13484e58cf8a94f6cf675f346116277e315bc42ba21b9e031d7abf2be99d48bfffa7a3ab5134bb71e87b134a17c855b62bb7a2cd1af048f29a09fa66eb61d5b
7
+ data.tar.gz: 866bf793b4475f784ca534119063e6ebe18452993ff1fdef32f08f48231c90d9c4bf4f622561512a37b3a83c5024226b92ca6015a184a9d0c1e7a0bb599976af
@@ -0,0 +1,76 @@
1
module Ogo
  # Describes a single image referenced by a page: its URL plus lazily
  # resolved dimensions and format.
  #
  # Width, height and type may be supplied up front through the constructor
  # options. Anything missing is looked up through FastImage, but only when
  # the host application has loaded it (checked with `defined?(FastImage)`).
  class ImageInfo

    attr_accessor :url

    # @param opts [Hash] recognised keys: :url, :width, :height, :type
    def initialize(opts={})
      @url    = opts[:url]
      @width  = opts[:width]
      @height = opts[:height]
      @type   = opts[:type]
    end

    # @return [Integer, nil] image width, or nil when it cannot be determined
    def width
      fetch_size[0]
    end

    # @return [Integer, nil] image height, or nil when it cannot be determined
    def height
      fetch_size[1]
    end

    # @return [String] image format (see #fetch_type)
    def type
      fetch_type
    end

    # @return [String] "image/<type>"
    def content_type
      "image/#{fetch_type}"
    end

    # Resolves the image dimensions, memoising them in @width/@height.
    #
    # @return [Array] [width, height] when both are known; an empty array
    #   when FastImage is not loaded or the lookup failed. An array is always
    #   returned so that #width and #height can index into it safely.
    def fetch_size
      return [@width, @height] if @width && @height
      return [] unless defined?(FastImage)

      # fi_check returns nil on failure; destructuring nil clears both values.
      @width, @height = fi_check(:size, url)
      @width && @height ? [@width, @height] : []
    end

    # Forgets the cached dimensions and resolves them again.
    def fetch_size!
      @width = @height = nil
      fetch_size
    end

    # Resolves the image format, memoising it in @type.
    #
    # With FastImage loaded the format comes from FastImage (an empty string
    # when the lookup fails). Otherwise it is the file extension of the URL
    # path, or an empty string when the path has no extension.
    #
    # @return [String]
    def fetch_type
      @type ||=
        if defined?(FastImage)
          fi_check(:type, url).to_s
        else
          path = Addressable::URI.parse(url).normalize.path
          File.extname(path).delete('.')
        end
    end

    # Forgets the cached format and resolves it again.
    def fetch_type!
      @type = nil
      fetch_type
    end

    private

    # Calls FastImage.<method>(url, options). If that raises, retries once
    # with the Addressable-normalised form of the URL; when the retry
    # succeeds the normalised URL (as a String) replaces @url.
    #
    # @param method [Symbol] FastImage class method to invoke (:size or :type)
    # @param url [String] image location
    # @param options [Hash, nil] FastImage options; defaults to raising on
    #   failure with a 2 second timeout
    # @return [Object, nil] FastImage's result, or nil when both attempts fail
    def fi_check(method, url, options=nil)
      options ||= {raise_on_failure: true, timeout: 2.0}
      FastImage.send(method, url, options)
    rescue StandardError
      begin
        normalized = Addressable::URI.parse(url).normalize.to_s
        value = FastImage.send(method, normalized, options)
        @url = normalized
        value
      rescue StandardError => e
        # Report on stderr: a library should not write diagnostics to stdout.
        warn "Image url error: url=\"#{url}\", error=\"#{e}\""
        nil
      end
    end

  end
end
@@ -0,0 +1,60 @@
1
+ module Ogo
2
+ class PageSource
3
+
4
+ attr_reader :url, :src, :charset, :doc
5
+
6
+ def initialize(src, options={})
7
+ @src = src
8
+ @url = options[:url]
9
+ @charset = options[:charset]
10
+ end
11
+
12
+ def parse
13
+ unless charset
14
+ _doc = Nokogiri.parse(src.scrub)
15
+ @charset = guess_encoding(_doc)
16
+ end
17
+ Nokogiri::HTML(src, nil, charset)
18
+ end
19
+
20
+ def parse!
21
+ @doc = parse
22
+ self
23
+ end
24
+
25
+ def inspect
26
+ str = "<Ogo::PageSource:0x00#{'%x' % (self.object_id << 1)}\n"
27
+ str << "url=\"#{url}\",\n"
28
+ str << "charset=\"#{charset}\",\n"
29
+ str << "src=\"#{src.to_s.truncate(100, omission: '...')}\",\n"
30
+ str << "doc=#{doc.to_s.truncate(100, omission: '...')}\" >"
31
+ str
32
+ end
33
+
34
+ def to_s
35
+ inspect
36
+ end
37
+
38
+ private
39
+
40
+ def guess_encoding(_doc)
41
+ _charset = _doc.xpath('//meta/@charset').first
42
+ return _charset.value.to_s if charset
43
+
44
+ _charset = _doc.xpath('//meta').each do |m|
45
+ if content_tag?(m)
46
+ return m.attribute('content').value.split('charset=').last.strip
47
+ end
48
+ end
49
+
50
+ 'UTF-8'
51
+ end
52
+
53
+ def content_tag?(m)
54
+ m.attribute('http-equiv') &&
55
+ m.attribute('content') &&
56
+ m.attribute('http-equiv').value.casecmp('Content-Type')
57
+ end
58
+
59
+ end
60
+ end
@@ -0,0 +1,106 @@
1
+ module Ogo
2
+ module Parsers
3
+ class Base
4
+
5
+ attr_reader :page, :url
6
+
7
+ def initialize(parseable)
8
+ @page = \
9
+ if parseable.include?('</html>')
10
+ @url = ''
11
+ Ogo::PageSource.new(parseable).parse!
12
+ else
13
+ _rf = Ogo::Utils::RedirectFollower.new(parseable).resolve
14
+ page = Ogo::PageSource.new(_rf.body, charset: _rf.charset, url: _rf.url)
15
+ @url = _rf.url
16
+ page.parse!
17
+ end
18
+ @type = 'website'
19
+ end
20
+
21
+ def title(fallback=false)
22
+ title_tag = page.doc.xpath('//head//title').first
23
+ title_tag && title_tag.text.to_s.strip
24
+ end
25
+
26
+ def description(fallback=false)
27
+ description_meta = page.doc.xpath("//head//meta[@name='description']").first
28
+ _desc = description_meta && description_meta.attribute("content").to_s.strip
29
+ if !_desc || _desc.empty?
30
+ _desc = fetch_first_text
31
+ end
32
+ _desc
33
+ end
34
+
35
+ def image(fallback=false)
36
+ all_images.first
37
+ end
38
+
39
+ def type(fallback=false)
40
+ @type
41
+ end
42
+
43
+ def all_images
44
+ @all_images ||= \
45
+ begin
46
+ imgs = (
47
+ fetch_images("//head//meta[@itemprop='image']", "content") +
48
+ fetch_images("//head//meta[@itemprop='logo']", "content") +
49
+ fetch_images("//head//meta[@property='og:image']", "content") +
50
+ fetch_images("//head//meta[@property='twitter:image:src']", "content") +
51
+ fetch_images("//head//link[@rel='image_src']", "href") +
52
+ fetch_images("//img", "src")
53
+ ).flatten.compact.uniq
54
+ host_uri = Addressable::URI.parse(url)
55
+ imgs.map { |img|
56
+ Ogo::ImageInfo.new(url: fix_image_path(img, host_uri))
57
+ }
58
+ end
59
+ end
60
+
61
+ def metadata(fallback=false)
62
+ _meta = {
63
+ title: title,
64
+ description: description,
65
+ type: type,
66
+ image: nil
67
+ }
68
+ if image
69
+ _meta[:image] = {
70
+ url: image.url,
71
+ width: image.width,
72
+ height: image.height,
73
+ type: image.type
74
+ }
75
+ end
76
+ _meta
77
+ end
78
+
79
+ private
80
+
81
+ def fix_image_path(img, host_uri)
82
+ return "http:#{img}" if img.start_with?('//')
83
+ return img if host_uri.host.nil?
84
+ if Addressable::URI.parse(img).host.nil?
85
+ host_uri.join(img).to_s
86
+ else
87
+ img
88
+ end
89
+ end
90
+
91
+ def fetch_first_text
92
+ page.doc.xpath('//p').each do |p|
93
+ s = p.text.to_s.strip
94
+ return s if s.length > 20
95
+ end
96
+ end
97
+
98
+ def fetch_images(xpath_str, attr)
99
+ page.doc.xpath(xpath_str).map do |tag|
100
+ tag.attribute(attr).to_s.strip
101
+ end.reject { |it| it.empty? }.uniq
102
+ end
103
+
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,90 @@
1
module Ogo
  module Parsers
    # Reads Open Graph (<meta property="og:...">) data from a page.
    #
    # Each reader returns the og: value when present. When it is absent the
    # plain-HTML value from Ogo::Parsers::Base is used only if fallback was
    # enabled for the whole parser; passing `true` to a reader skips the
    # og: lookup and returns the Base value directly.
    class Opengraph < Ogo::Parsers::Base

      # @param parseable [String] raw HTML or a location (see Base)
      # @param fallback [Boolean] use Base values whenever an og: tag is missing
      def initialize(parseable, fallback=false)
        @global_fallback = fallback
        super parseable
      end

      # @return [String, nil] og:title, the Base title, or ''
      def title(fallback=false)
        return super if fallback
        og_value('title') { super }
      end

      # @return [String, nil] og:description, the Base description, or ''
      def description(fallback=false)
        return super if fallback
        og_value('description') { super }
      end

      # @return [Ogo::ImageInfo, nil] the og:image, the Base image, or nil
      def image(fallback=false)
        return super if fallback

        og_url = find_meta('image')
        if og_url.empty?
          (@global_fallback && super) || nil
        else
          # Cache the object so that repeated calls (for example from
          # #metadata) share one ImageInfo and its already-resolved
          # size/type instead of building a new one each time.
          @og_image ||= Ogo::ImageInfo.new(
            url: fix_image_path(og_url, Addressable::URI.parse(url))
          )
        end
      end

      # @return [String, nil] og:type, the Base type, or ''
      def type(fallback=false)
        return super if fallback
        og_value('type') { super }
      end

      # @param fallback [Boolean] when true, adds a :fallback entry holding
      #   the Base values next to the Open Graph ones
      # @return [Hash]
      def metadata(fallback=false)
        _meta = super
        return _meta unless fallback

        _meta[:fallback] = {
          title: title(true),
          description: description(true),
          type: type(true),
          image: nil
        }
        img = image(true)
        if img
          _meta[:fallback][:image] = {
            url: img.url,
            width: img.width,
            height: img.height,
            type: img.type
          }
        end
        _meta
      end

      private

      # Returns the og:<meta_type> content when it is non-empty; otherwise
      # the block's value if global fallback is on, else ''.
      def og_value(meta_type)
        value = find_meta(meta_type)
        return value unless value.empty?
        (@global_fallback && yield) || ''
      end

      # @return [String] stripped content of the first <meta> in <head>
      #   whose property is "og:<meta_type>", or '' when there is none
      def find_meta(meta_type)
        wanted = "og:#{meta_type}"
        tag = page.doc.xpath('//head//meta').find do |candidate|
          candidate.attribute('property').to_s == wanted
        end
        (tag && tag.attribute('content')).to_s.strip
      end

    end
  end
end
@@ -0,0 +1,6 @@
1
+ module Ogo
2
+ module Parsers
3
+ require_relative 'parsers/base'
4
+ require_relative 'parsers/opengraph'
5
+ end
6
+ end
data/lib/ogo/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Ogo
2
- VERSION = '0.0.2'
2
+ VERSION = '0.1.1'
3
3
  end
data/lib/ogo.rb CHANGED
@@ -2,8 +2,11 @@ require 'addressable/uri'
2
2
  require 'uri'
3
3
  require 'nokogiri'
4
4
 
5
- require_relative 'ogo/opengraph'
5
+ require_relative 'ogo/page_source'
6
+ require_relative 'ogo/image_info'
7
+ require_relative 'ogo/parsers'
6
8
  require_relative 'ogo/utils/redirect_follower'
9
+ require_relative 'ogo/version'
7
10
 
8
11
  module Ogo
9
12
 
data/ogo.gemspec CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
19
19
  s.add_dependency 'nokogiri', '>= 1.6'
20
20
  s.add_dependency 'addressable', '>= 2.4.0'
21
21
  s.add_development_dependency 'rspec', '>= 3.0'
22
+ s.add_development_dependency 'fastimage'
22
23
  s.add_development_dependency 'pry'
23
24
  s.add_development_dependency 'byebug'
24
25
  s.add_development_dependency 'rake'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ogo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - gazay
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-24 00:00:00.000000000 Z
11
+ date: 2016-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fastimage
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: pry
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -107,7 +121,11 @@ files:
107
121
  - README.md
108
122
  - Rakefile
109
123
  - lib/ogo.rb
110
- - lib/ogo/opengraph.rb
124
+ - lib/ogo/image_info.rb
125
+ - lib/ogo/page_source.rb
126
+ - lib/ogo/parsers.rb
127
+ - lib/ogo/parsers/base.rb
128
+ - lib/ogo/parsers/opengraph.rb
111
129
  - lib/ogo/utils/redirect_follower.rb
112
130
  - lib/ogo/version.rb
113
131
  - ogo.gemspec
data/lib/ogo/opengraph.rb DELETED
@@ -1,179 +0,0 @@
1
module Ogo
  # Fetches a page (or takes raw HTML) and collects its Open Graph data,
  # optionally filling gaps from ordinary HTML markup.
  class Opengraph

    attr_accessor :src, :url, :type, :title, :description,
                  :images, :metadata, :response, :body, :charset, :original_images,
                  :error

    # @param src [String] raw HTML (recognised by containing "</html>") or a
    #   location handed to Ogo::Utils::RedirectFollower
    # @param options [Hash] :fallback (default true) controls whether plain
    #   HTML is used for missing values; the whole hash is also passed to
    #   the redirect follower
    def initialize(src, options = {})
      @src = src
      @body = nil
      @images = []
      @metadata = {}
      @charset = 'utf-8'
      @error = nil

      # `options[:fallback] || true` could never be false; honour an
      # explicit false and default to true only when the key is unset.
      @_fallback = options[:fallback].nil? ? true : options[:fallback]
      @_options = options
    end

    # Parses the source. Errors raised while loading it are stored in
    # #error instead of being raised.
    #
    # @return [self]
    def parse
      parse_opengraph(@_options)
      load_fallback if @_fallback
      check_images_path
      self
    end

    # Like #parse, but re-raises the stored error, if any.
    def parse!
      parse
      error ? raise(error) : self
    end

    def inspect
      str = "<Ogo::Opengraph:0x00#{'%x' % (self.object_id << 1)}\nurl=\"#{url}\",\nmetadata=#{metadata},\n"
      str << "images=#{images},\ntype=\"#{type}\",\ntitle=\"#{title}\">"
      str
    end

    def to_s
      inspect
    end

    private

    # Loads the body and reads every <meta property="og:..."> tag into
    # #metadata, the simple attributes and #images.
    def parse_opengraph(options = {})
      begin
        if src.include? '</html>'
          self.body = src
        else
          resolved = Ogo::Utils::RedirectFollower.new(src, options).resolve
          self.body = resolved.body
          self.charset = resolved.charset if resolved.charset
        end
      rescue => e
        self.title = self.url = src
        self.error = e
        return
      end

      return unless body

      attrs_list = %w(title url type description)
      parse_html.css('meta').each do |m|
        og = m.attribute('property').to_s.match(/^og:(.+)$/i)
        next unless og

        m_content = m.attribute('content').to_s.strip
        # Use the captured name so the prefix is dropped regardless of case.
        metadata_name = og[1]
        self.metadata = add_metadata(metadata, metadata_name, m_content)
        case metadata_name
        when *attrs_list
          instance_variable_set("@#{metadata_name}", m_content) unless m_content.empty?
        when "image"
          add_image(m_content)
        end
      end
    end

    # Fills title, url, description and images from plain HTML wherever the
    # Open Graph tags left them empty.
    def load_fallback
      return unless body

      doc = parse_html

      title_tag = doc.xpath("//head//title").first
      self.title = title_tag.text.to_s.strip if title.to_s.empty? && title_tag

      self.url = src if url.to_s.empty?

      if description.to_s.empty?
        description_meta = doc.xpath("//head//meta[@name='description']").first
        self.description = description_meta.attribute("content").to_s.strip if description_meta
      end

      self.description = fetch_first_text(doc) if description.to_s.empty?

      fetch_images(doc, "//head//link[@rel='image_src']", "href") if images.empty?
      fetch_images(doc, "//img", "src") if images.empty?
    end

    # Keeps the images as found in #original_images and rebuilds #images
    # with host-less references joined onto #url.
    def check_images_path
      self.original_images = images.dup
      base = Addressable::URI.parse(url) # nil when url is nil
      candidates = images.dup
      self.images = []
      candidates.each do |img|
        relative = Addressable::URI.parse(img).host.nil?
        add_image(relative && base ? base.join(img).to_s : img)
      end
    end

    # Appends image_url to #images unless it is blank or already present.
    def add_image(image_url)
      return if image_url.to_s.empty? || images.include?(image_url)
      images << image_url
    end

    def fetch_images(doc, xpath_str, attr)
      doc.xpath(xpath_str).each do |link|
        add_image(link.attribute(attr).to_s.strip)
      end
    end

    # @return [String, nil] stripped text of the first <p> longer than 20
    #   characters, or nil when there is none
    def fetch_first_text(doc)
      doc.xpath('//p').each do |p|
        s = p.text.to_s.strip
        return s if s.length > 20
      end
      nil
    end

    # Stores `content` under a colon-separated `path` inside
    # `metadata_container`. Each path segment becomes a symbol key holding
    # an array of hashes; the final segment's hash is { _value: content }.
    # Nested segments are attached to the most recent entry of their parent.
    #
    # @return [Hash] the (mutated) container
    def add_metadata(metadata_container, path, content)
      head, *rest = path.split(':')
      if rest.empty?
        metadata_container[path.to_sym] ||= []
        metadata_container[path.to_sym] << {'_value'.to_sym => content}
        return metadata_container
      end

      key = head.to_sym
      remainder = rest.join(':')
      if metadata_container[key]
        last_index = metadata_container[key].size - 1
        metadata_container[key][last_index] =
          add_metadata(metadata_container[key].last, remainder, content)
      else
        metadata_container[key] = [add_metadata({}, remainder, content)]
      end
      metadata_container
    end

    # Parses #body with Nokogiri, guessing the charset first when none is set.
    def parse_html
      unless charset
        doc = Nokogiri.parse(body.scrub)
        self.charset = guess_encoding(doc)
      end
      Nokogiri::HTML(body, nil, charset)
    end

    # @return [String] charset declared by <meta charset> or by a
    #   <meta http-equiv="Content-Type"> charset parameter; 'UTF-8' otherwise
    def guess_encoding(doc)
      _charset = doc.xpath('//meta/@charset').first
      declared = _charset && _charset.value.to_s.strip
      return declared unless declared.nil? || declared.empty?

      doc.xpath('//meta').each do |m|
        http_equiv = m.attribute('http-equiv')
        next unless http_equiv && m.attribute('content')
        # casecmp returns an Integer (0 on match) or nil, never a boolean.
        next unless http_equiv.value.to_s.casecmp('Content-Type') == 0
        declared = m.attribute('content').value.to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
        return declared if declared
      end

      'UTF-8'
    end

  end
end