ogo 0.0.2 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/ogo/image_info.rb +76 -0
- data/lib/ogo/page_source.rb +60 -0
- data/lib/ogo/parsers/base.rb +106 -0
- data/lib/ogo/parsers/opengraph.rb +90 -0
- data/lib/ogo/parsers.rb +6 -0
- data/lib/ogo/version.rb +1 -1
- data/lib/ogo.rb +4 -1
- data/ogo.gemspec +1 -0
- metadata +21 -3
- data/lib/ogo/opengraph.rb +0 -179
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3916d06ba08cf0c304c487d5754590b0bfdb64aa
|
4
|
+
data.tar.gz: 06cc4a8c9e9c38f84c86fe56acab9756901ea058
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c13484e58cf8a94f6cf675f346116277e315bc42ba21b9e031d7abf2be99d48bfffa7a3ab5134bb71e87b134a17c855b62bb7a2cd1af048f29a09fa66eb61d5b
|
7
|
+
data.tar.gz: 866bf793b4475f784ca534119063e6ebe18452993ff1fdef32f08f48231c90d9c4bf4f622561512a37b3a83c5024226b92ca6015a184a9d0c1e7a0bb599976af
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Ogo
  # Describes one image referenced by a page: its URL plus lazily resolved
  # dimensions and format.
  #
  # Values handed to the constructor are used as-is. Anything missing is
  # probed through FastImage when that constant is defined; FastImage is an
  # optional dependency, hence the `defined?` checks instead of a require.
  class ImageInfo

    attr_accessor :url

    # @param opts [Hash] optional :url, :width, :height and :type values
    def initialize(opts={})
      @url    = opts[:url]
      @width  = opts[:width]
      @height = opts[:height]
      @type   = opts[:type]
    end

    # @return [Integer, nil] image width, or nil when it cannot be determined
    def width
      fetch_size[0]
    end

    # @return [Integer, nil] image height, or nil when it cannot be determined
    def height
      fetch_size[1]
    end

    # @return [String] image format as resolved by #fetch_type
    def type
      fetch_type
    end

    # @return [String] "image/<type>" built from #fetch_type
    def content_type
      "image/#{fetch_type}"
    end

    # Resolves the image dimensions.
    #
    # @return [Array] `[width, height]` when both are known or FastImage was
    #   asked, `[]` when FastImage is not available
    def fetch_size
      return [@width, @height] if @width && @height
      return [] unless defined?(FastImage)

      # fi_check yields nil when the probe fails; Array() keeps the result
      # indexable so #width / #height return nil instead of raising.
      @width, @height = Array(fi_check(:size, url))
      [@width, @height]
    end

    # Drops any known dimensions and resolves them again.
    def fetch_size!
      @width, @height = nil
      fetch_size
    end

    # Resolves (and memoizes) the image format: through FastImage when it is
    # defined, otherwise from the text after the last dot of the URL path.
    #
    # @return [String]
    def fetch_type
      @type ||=
        if defined?(FastImage)
          fi_check(:type, url).to_s
        else
          uri = Addressable::URI.parse(url).normalize
          uri.path.split('.').last.to_s
        end
    end

    # Drops the memoized format and resolves it again.
    def fetch_type!
      @type = nil
      fetch_type
    end

    private

    # Calls `FastImage.<method>(url, options)`. If that raises, retries once
    # with the Addressable-normalized URL and, on success, remembers the
    # normalized URL (as a String) in @url.
    #
    # @param method [Symbol] FastImage class method to invoke (:size or :type)
    # @param url [String] image location
    # @param options [Hash, nil] FastImage options; defaults to raising on
    #   failure with a 2 second timeout
    # @return [Object, nil] FastImage's result, or nil when both attempts fail
    def fi_check(method, url, options=nil)
      options ||= {raise_on_failure: true, timeout: 2.0}
      FastImage.send(method, url, options)
    rescue
      begin
        normalized = Addressable::URI.parse(url).normalize.to_s
        val = FastImage.send(method, normalized, options)
        @url = normalized
        val
      rescue => e
        # Report on stderr so library diagnostics do not pollute stdout.
        warn "Image url error: url=\"#{url}\", error=\"#{e}\""
        nil
      end
    end

  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Ogo
  # Wraps the raw markup of a page together with its URL and character set,
  # and turns it into a Nokogiri document.
  class PageSource

    attr_reader :url, :src, :charset, :doc

    # @param src [String] raw page markup
    # @param options [Hash] optional :url and :charset values
    def initialize(src, options={})
      @src     = src
      @url     = options[:url]
      @charset = options[:charset]
    end

    # Parses the source as HTML. When no charset was supplied, the markup is
    # first parsed once (scrubbed of invalid bytes) to guess the encoding
    # from its <meta> tags.
    #
    # @return [Object] the result of `Nokogiri::HTML(src, nil, charset)`
    def parse
      unless charset
        probe = Nokogiri.parse(src.scrub)
        @charset = guess_encoding(probe)
      end
      Nokogiri::HTML(src, nil, charset)
    end

    # Parses the source and stores the document in #doc.
    #
    # @return [Ogo::PageSource] self, for chaining
    def parse!
      @doc = parse
      self
    end

    # @return [String] multi-line summary with src and doc shortened to 100
    #   characters each
    def inspect
      str = "<Ogo::PageSource:0x00#{'%x' % (object_id << 1)}\n"
      str << "url=\"#{url}\",\n"
      str << "charset=\"#{charset}\",\n"
      str << "src=\"#{shorten(src)}\",\n"
      str << "doc=\"#{shorten(doc)}\" >"
      str
    end

    def to_s
      inspect
    end

    private

    # Shortens text to at most `limit` characters, omission included.
    # Plain-Ruby stand-in for ActiveSupport's String#truncate, which is not
    # available unless the host application loads ActiveSupport.
    def shorten(text, limit=100, omission='...')
      text = text.to_s
      return text if text.length <= limit

      "#{text[0, limit - omission.length]}#{omission}"
    end

    # Determines the document encoding from its <meta> tags.
    #
    # Looks for `<meta charset="...">` first, then for the `charset=` part of
    # a `<meta http-equiv="Content-Type" content="...">` tag.
    #
    # @param _doc [#xpath] parsed document to inspect
    # @return [String] the declared charset, or 'UTF-8' when none is declared
    def guess_encoding(_doc)
      declared = _doc.xpath('//meta/@charset').first
      return declared.value.to_s if declared

      _doc.xpath('//meta').each do |m|
        next unless content_tag?(m)

        # Only accept content values that actually carry a charset.
        found = m.attribute('content').value[/charset=\s*["']?([^\s;"']+)/i, 1]
        return found if found
      end

      'UTF-8'
    end

    # @param m [#attribute] a <meta> node
    # @return [Boolean] true when the node is an http-equiv Content-Type tag
    #   that has a content attribute
    def content_tag?(m)
      equiv = m.attribute('http-equiv')
      return false unless equiv && m.attribute('content')

      # casecmp returns 0 on a match (and -1/1/nil otherwise), so compare
      # explicitly instead of relying on truthiness.
      equiv.value.casecmp('Content-Type') == 0
    end

  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
module Ogo
  module Parsers
    # Extracts generic page metadata (title, description, images) from plain
    # HTML tags. Subclasses refine the lookups; the `fallback` parameters are
    # unused here and exist so subclasses can share the same signatures.
    class Base

      attr_reader :page, :url

      # @param parseable [String] either raw HTML (recognised by a closing
      #   `</html>` tag) or a URL to be fetched through
      #   Ogo::Utils::RedirectFollower
      def initialize(parseable)
        @page =
          if parseable.include?('</html>')
            @url = ''
            Ogo::PageSource.new(parseable).parse!
          else
            _rf = Ogo::Utils::RedirectFollower.new(parseable).resolve
            @url = _rf.url
            Ogo::PageSource.new(_rf.body, charset: _rf.charset, url: _rf.url).parse!
          end
        @type = 'website'
      end

      # @return [String, nil] stripped text of the first <title> in <head>
      def title(fallback=false)
        title_tag = page.doc.xpath('//head//title').first
        title_tag && title_tag.text.to_s.strip
      end

      # @return [String, nil] content of the description <meta>, or the first
      #   paragraph longer than 20 characters when that is missing or empty
      def description(fallback=false)
        description_meta = page.doc.xpath("//head//meta[@name='description']").first
        _desc = description_meta && description_meta.attribute("content").to_s.strip
        _desc = fetch_first_text if !_desc || _desc.empty?
        _desc
      end

      # @return [Ogo::ImageInfo, nil] the first entry of #all_images
      def image(fallback=false)
        all_images.first
      end

      # @return [String] the page type set in the constructor
      def type(fallback=false)
        @type
      end

      # Collects image URLs from meta/link tags and <img> elements, in that
      # order, de-duplicated and made absolute where possible. Memoized.
      #
      # @return [Array<Ogo::ImageInfo>]
      def all_images
        @all_images ||=
          begin
            imgs = (
              fetch_images("//head//meta[@itemprop='image']", "content") +
              fetch_images("//head//meta[@itemprop='logo']", "content") +
              fetch_images("//head//meta[@property='og:image']", "content") +
              fetch_images("//head//meta[@property='twitter:image:src']", "content") +
              fetch_images("//head//link[@rel='image_src']", "href") +
              fetch_images("//img", "src")
            ).flatten.compact.uniq
            host_uri = Addressable::URI.parse(url)
            imgs.map { |img| Ogo::ImageInfo.new(url: fix_image_path(img, host_uri)) }
          end
      end

      # @return [Hash] :title, :description, :type and :image (nil, or a hash
      #   with :url, :width, :height and :type)
      def metadata(fallback=false)
        _meta = {
          title: title,
          description: description,
          type: type,
          image: nil
        }
        # Look the image up once: subclasses may build a new ImageInfo per
        # call, and each fresh object would probe its size and type again.
        img = image
        if img
          _meta[:image] = {
            url: img.url,
            width: img.width,
            height: img.height,
            type: img.type
          }
        end
        _meta
      end

      private

      # Turns an image reference into an absolute URL where possible.
      #
      # @param img [String] image reference found in the page
      # @param host_uri [#scheme, #host, #join, nil] parsed URL of the page
      # @return [String]
      def fix_image_path(img, host_uri)
        if img.start_with?('//')
          # Protocol-relative reference: reuse the page's scheme, defaulting
          # to http when the page URL is unknown.
          scheme = (host_uri && host_uri.scheme) || 'http'
          return "#{scheme}:#{img}"
        end
        return img if host_uri.nil? || host_uri.host.nil?

        if Addressable::URI.parse(img).host.nil?
          host_uri.join(img).to_s
        else
          img
        end
      end

      # @return [String, nil] stripped text of the first <p> longer than 20
      #   characters, or nil when there is none
      def fetch_first_text
        page.doc.xpath('//p').each do |p|
          s = p.text.to_s.strip
          return s if s.length > 20
        end
        # Without this, the value of `each` (the node collection) leaks out.
        nil
      end

      # @param xpath_str [String] XPath selecting the candidate tags
      # @param attr [String] attribute holding the image reference
      # @return [Array<String>] non-empty, unique attribute values
      def fetch_images(xpath_str, attr)
        page.doc.xpath(xpath_str).map do |tag|
          tag.attribute(attr).to_s.strip
        end.reject { |it| it.empty? }.uniq
      end

    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module Ogo
  module Parsers
    # Reads Open Graph (`og:*`) meta tags, optionally falling back to the
    # generic lookups of Ogo::Parsers::Base.
    #
    # Every reader takes a `fallback` flag: when true the Base lookup is used
    # directly. Otherwise the og: value wins, and the Base lookup is only
    # consulted when the parser was created with `fallback` enabled.
    class Opengraph < Ogo::Parsers::Base

      # @param parseable [String] raw HTML or a URL (see Base#initialize)
      # @param fallback [Boolean] consult Base lookups when an og: tag is
      #   missing or empty
      def initialize(parseable, fallback=false)
        @global_fallback = fallback
        super parseable
      end

      # @return [String, nil] og:title; '' when absent and no fallback applies
      def title(fallback=false)
        return super if fallback

        _val = find_meta('title')
        return _val unless _val.empty?

        (@global_fallback && super) || ''
      end

      # @return [String, nil] og:description; '' when absent and no fallback
      #   applies
      def description(fallback=false)
        return super if fallback

        _val = find_meta('description')
        return _val unless _val.empty?

        (@global_fallback && super) || ''
      end

      # @return [Ogo::ImageInfo, nil] og:image; nil when absent and no
      #   fallback applies
      def image(fallback=false)
        return super if fallback

        _val = find_meta('image')
        if _val.empty?
          (@global_fallback && super) || nil
        else
          # Memoized so repeated calls share one ImageInfo and its cached
          # size/type instead of probing the remote image again each time.
          @og_image ||= begin
            host_uri = Addressable::URI.parse(url)
            Ogo::ImageInfo.new(url: fix_image_path(_val, host_uri))
          end
        end
      end

      # @return [String] og:type; '' when absent and no fallback applies
      def type(fallback=false)
        return super if fallback

        _val = find_meta('type')
        return _val unless _val.empty?

        (@global_fallback && super) || ''
      end

      # @param fallback [Boolean] when true, adds a :fallback entry holding
      #   the values of the Base lookups
      # @return [Hash] Base#metadata, plus the optional :fallback hash
      def metadata(fallback=false)
        _meta = super
        return _meta unless fallback

        _meta[:fallback] = {
          title: title(true),
          description: description(true),
          type: type(true),
          image: nil
        }
        img = image(true)
        if img
          _meta[:fallback][:image] = {
            url: img.url,
            width: img.width,
            height: img.height,
            type: img.type
          }
        end
        _meta
      end

      private

      # @param meta_type [String] og: property name without the prefix
      # @return [String] stripped content of the first matching <meta> in
      #   <head>, or '' when there is none
      def find_meta(meta_type)
        tag = page.doc.xpath('//head//meta').find { |it|
          it.attribute('property').to_s == "og:#{meta_type}"
        }
        (tag && tag.attribute('content')).to_s.strip
      end

    end
  end
end
|
data/lib/ogo/parsers.rb
ADDED
data/lib/ogo/version.rb
CHANGED
data/lib/ogo.rb
CHANGED
@@ -2,8 +2,11 @@ require 'addressable/uri'
|
|
2
2
|
require 'uri'
|
3
3
|
require 'nokogiri'
|
4
4
|
|
5
|
-
require_relative 'ogo/
|
5
|
+
require_relative 'ogo/page_source'
|
6
|
+
require_relative 'ogo/image_info'
|
7
|
+
require_relative 'ogo/parsers'
|
6
8
|
require_relative 'ogo/utils/redirect_follower'
|
9
|
+
require_relative 'ogo/version'
|
7
10
|
|
8
11
|
module Ogo
|
9
12
|
|
data/ogo.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.add_dependency 'nokogiri', '>= 1.6'
|
20
20
|
s.add_dependency 'addressable', '>= 2.4.0'
|
21
21
|
s.add_development_dependency 'rspec', '>= 3.0'
|
22
|
+
s.add_development_dependency 'fastimage'
|
22
23
|
s.add_development_dependency 'pry'
|
23
24
|
s.add_development_dependency 'byebug'
|
24
25
|
s.add_development_dependency 'rake'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ogo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gazay
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: fastimage
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: pry
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -107,7 +121,11 @@ files:
|
|
107
121
|
- README.md
|
108
122
|
- Rakefile
|
109
123
|
- lib/ogo.rb
|
110
|
-
- lib/ogo/
|
124
|
+
- lib/ogo/image_info.rb
|
125
|
+
- lib/ogo/page_source.rb
|
126
|
+
- lib/ogo/parsers.rb
|
127
|
+
- lib/ogo/parsers/base.rb
|
128
|
+
- lib/ogo/parsers/opengraph.rb
|
111
129
|
- lib/ogo/utils/redirect_follower.rb
|
112
130
|
- lib/ogo/version.rb
|
113
131
|
- ogo.gemspec
|
data/lib/ogo/opengraph.rb
DELETED
@@ -1,179 +0,0 @@
|
|
1
|
-
module Ogo
|
2
|
-
class Opengraph
|
3
|
-
|
4
|
-
attr_accessor :src, :url, :type, :title, :description,
|
5
|
-
:images, :metadata, :response, :body, :charset, :original_images,
|
6
|
-
:error
|
7
|
-
|
8
|
-
def initialize(src, options = {})
|
9
|
-
@src = src
|
10
|
-
@body = nil
|
11
|
-
@images = []
|
12
|
-
@metadata = {}
|
13
|
-
@charset = 'utf-8'
|
14
|
-
@error = nil
|
15
|
-
|
16
|
-
@_fallback = options[:fallback] || true
|
17
|
-
@_options = options
|
18
|
-
end
|
19
|
-
|
20
|
-
def parse
|
21
|
-
parse_opengraph(@_options)
|
22
|
-
load_fallback if @_fallback
|
23
|
-
check_images_path
|
24
|
-
self
|
25
|
-
end
|
26
|
-
|
27
|
-
def parse!
|
28
|
-
parse
|
29
|
-
error ? raise(error) : self
|
30
|
-
end
|
31
|
-
|
32
|
-
def inspect
|
33
|
-
str = "<Ogo::Opengraph:0x00#{'%x' % (self.object_id << 1)}\nurl=\"#{url}\",\nmetadata=#{metadata},\n"
|
34
|
-
str << "images=#{images},\ntype=\"#{type}\",\ntitle=\"#{title}\">"
|
35
|
-
str
|
36
|
-
end
|
37
|
-
|
38
|
-
def to_s
|
39
|
-
inspect
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def parse_opengraph(options = {})
|
45
|
-
begin
|
46
|
-
if src.include? '</html>'
|
47
|
-
self.body = src
|
48
|
-
else
|
49
|
-
resolved = Ogo::Utils::RedirectFollower.new(src, options).resolve
|
50
|
-
self.body = resolved.body
|
51
|
-
self.charset = resolved.charset if resolved.charset
|
52
|
-
end
|
53
|
-
rescue => e
|
54
|
-
self.title = self.url = src
|
55
|
-
self.error = e
|
56
|
-
return
|
57
|
-
end
|
58
|
-
|
59
|
-
if body
|
60
|
-
attrs_list = %w(title url type description)
|
61
|
-
doc = parse_html
|
62
|
-
doc.css('meta').each do |m|
|
63
|
-
if m.attribute('property') && m.attribute('property').to_s.match(/^og:(.+)$/i)
|
64
|
-
m_content = m.attribute('content').to_s.strip
|
65
|
-
metadata_name = m.attribute('property').to_s.gsub("og:", "")
|
66
|
-
self.metadata = add_metadata(metadata, metadata_name, m_content)
|
67
|
-
case metadata_name
|
68
|
-
when *attrs_list
|
69
|
-
self.instance_variable_set("@#{metadata_name}", m_content) unless m_content.empty?
|
70
|
-
when "image"
|
71
|
-
add_image(m_content)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def load_fallback
|
79
|
-
if body
|
80
|
-
doc = parse_html
|
81
|
-
|
82
|
-
if title.to_s.empty? && doc.xpath("//head//title").size > 0
|
83
|
-
self.title = doc.xpath("//head//title").first.text.to_s.strip
|
84
|
-
end
|
85
|
-
|
86
|
-
self.url = src if url.to_s.empty?
|
87
|
-
|
88
|
-
if description.to_s.empty? && description_meta = doc.xpath("//head//meta[@name='description']").first
|
89
|
-
self.description = description_meta.attribute("content").to_s.strip
|
90
|
-
end
|
91
|
-
|
92
|
-
if description.to_s.empty?
|
93
|
-
self.description = fetch_first_text(doc)
|
94
|
-
end
|
95
|
-
|
96
|
-
fetch_images(doc, "//head//link[@rel='image_src']", "href") if images.empty?
|
97
|
-
fetch_images(doc, "//img", "src") if images.empty?
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def check_images_path
|
102
|
-
self.original_images = images.dup
|
103
|
-
uri = Addressable::URI.parse(url)
|
104
|
-
imgs = images.dup
|
105
|
-
self.images = []
|
106
|
-
imgs.each do |img|
|
107
|
-
if Addressable::URI.parse(img).host.nil?
|
108
|
-
full_path = uri.join(img).to_s
|
109
|
-
add_image(full_path)
|
110
|
-
else
|
111
|
-
add_image(img)
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def add_image(image_url)
|
117
|
-
unless images.include?(image_url) || image_url.to_s.empty?
|
118
|
-
self.images << image_url
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
def fetch_images(doc, xpath_str, attr)
|
123
|
-
doc.xpath(xpath_str).each do |link|
|
124
|
-
add_image(link.attribute(attr).to_s.strip)
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
def fetch_first_text(doc)
|
129
|
-
doc.xpath('//p').each do |p|
|
130
|
-
s = p.text.to_s.strip
|
131
|
-
return s if s.length > 20
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
def add_metadata(metadata_container, path, content)
|
136
|
-
path_elements = path.split(':')
|
137
|
-
if path_elements.size > 1
|
138
|
-
current_element = path_elements.delete_at(0)
|
139
|
-
path = path_elements.join(':')
|
140
|
-
if metadata_container[current_element.to_sym]
|
141
|
-
path_pointer = metadata_container[current_element.to_sym].last
|
142
|
-
index_count = metadata_container[current_element.to_sym].size
|
143
|
-
metadata_container[current_element.to_sym][index_count - 1] = add_metadata(path_pointer, path, content)
|
144
|
-
metadata_container
|
145
|
-
else
|
146
|
-
metadata_container[current_element.to_sym] = []
|
147
|
-
metadata_container[current_element.to_sym] << add_metadata({}, path, content)
|
148
|
-
metadata_container
|
149
|
-
end
|
150
|
-
else
|
151
|
-
metadata_container[path.to_sym] ||= []
|
152
|
-
metadata_container[path.to_sym] << {'_value'.to_sym => content}
|
153
|
-
metadata_container
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def parse_html
|
158
|
-
unless charset
|
159
|
-
doc = Nokogiri.parse(body.scrub)
|
160
|
-
self.charset = guess_encoding(doc)
|
161
|
-
end
|
162
|
-
Nokogiri::HTML(body, nil, charset)
|
163
|
-
end
|
164
|
-
|
165
|
-
def guess_encoding(doc)
|
166
|
-
_charset = doc.xpath('//meta/@charset').first
|
167
|
-
return _charset.value.to_s if charset
|
168
|
-
|
169
|
-
_charset = doc.xpath('//meta').each do |m|
|
170
|
-
if m.attribute('http-equiv') && m.attribute('content') && m.attribute('http-equiv').value.casecmp('Content-Type')
|
171
|
-
return m.attribute('content').value.split('charset=').last.strip
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
'UTF-8'
|
176
|
-
end
|
177
|
-
|
178
|
-
end
|
179
|
-
end
|