linkser 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/linkser/object.rb +10 -0
- data/lib/linkser/objects/html.rb +121 -0
- data/lib/linkser/parser.rb +7 -8
- data/lib/linkser/version.rb +1 -1
- data/lib/linkser.rb +3 -2
- data/linkser.gemspec +1 -0
- metadata +27 -12
- data/lib/linkser/parser/html.rb +0 -73
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'net/http'
|
4
|
+
require 'image_spec'
|
5
|
+
require 'opengraph'
|
6
|
+
|
7
|
+
module Linkser
|
8
|
+
module Objects
|
9
|
+
class HTML < Linkser::Object
|
10
|
+
attr_reader :body, :nokogiri
|
11
|
+
attr_reader :title, :description, :images, :ogp
|
12
|
+
# Builds an HTML link object by handing every argument straight to the
# parent class initializer (the class is declared as a subclass of
# Linkser::Object).
#
# url     - the address of the page
# head    - the response obtained for the page by the caller
#           (presumably the HEAD response; confirm against the parser)
# options - optional Hash of settings, forwarded untouched
def initialize(url, head, options = {})
  super(url, head, options)
end
|
15
|
+
|
16
|
+
# Title of the page, computed lazily and cached in @title.
#
# The Open Graph title is preferred when Open Graph data is available and
# carries one; otherwise the text of the page's <title> element is used
# (if several exist, the last one wins).
#
# Returns the title, or nil when none could be found.
def title
  return @title unless @title.nil?

  if ogp && ogp.title
    @title = ogp.title
  else
    nokogiri.css('title').each { |node| @title = node.text }
  end
  @title
end
|
27
|
+
|
28
|
+
# Fetches the resource behind +url+ and caches the result in @body.
#
# The URL is parsed first and opened through open-uri's URI#open instead of
# Kernel#open: Kernel#open no longer accepts URLs on Ruby 3.0+, and on older
# versions it would execute a string starting with "|" as a shell command.
#
# Returns whatever open-uri yields for the URL (an IO-like object per the
# open-uri documentation).
# Raises URI::InvalidURIError when +url+ cannot be parsed, plus any error
# open-uri raises while fetching.
def body
  return @body unless @body.nil?

  @body = URI.parse(url).open
end
|
32
|
+
|
33
|
+
# Description of the page, computed lazily and cached in @description.
#
# The Open Graph description is preferred when Open Graph data is available
# and carries one; otherwise the "content" attribute of the
# <meta name="description"> element is used (the last one wins if there are
# several).
#
# Returns the description, or nil when none could be found.
def description
  return @description unless @description.nil?

  if ogp && ogp.description
    @description = ogp.description
  else
    nokogiri.css('meta').each do |tag|
      next unless tag.get_attribute("name").eql?("description")

      @description = tag.get_attribute("content")
    end
  end
  @description
end
|
46
|
+
|
47
|
+
# Collects the images that are worth showing for this page, cached in
# @images.
#
# The Open Graph image (when present) is considered first, followed by every
# <img> element whose path ends in .jpg, .jpeg or .png (case-insensitive).
# An image is kept only when valid_img? accepts its dimensions.
#
# Collection is best effort: an image whose source is missing, cannot be
# parsed as a URI, or cannot be inspected by ImageSpec is skipped instead of
# aborting the whole scan.
#
# Returns an Array of Hashes with the keys :img, :width and :height.
def images
  return @images unless @images.nil?

  @images = []
  if ogp && ogp.image
    begin
      og_spec = ImageSpec.new(ogp.image)
      if valid_img?(og_spec.width.to_f, og_spec.height.to_f)
        @images << {:img => ogp.image, :width => og_spec.width, :height => og_spec.height}
      end
    rescue StandardError
      # Deliberately ignored: an unreadable Open Graph image is simply left out.
    end
  end
  nokogiri.css('img').each do |img|
    img_src = img.get_attribute("src")
    # <img> elements without a usable src cannot be resolved to a URL.
    next if img_src.nil? || img_src.strip.empty?

    begin
      img_src = complete_url(img_src, url)
      img_ext = File.extname(URI.parse(img_src).path.to_s).downcase
      next unless [".jpg", ".jpeg", ".png"].include?(img_ext)

      img_spec = ImageSpec.new(img_src)
      if valid_img?(img_spec.width.to_f, img_spec.height.to_f)
        @images << {:img => img_src, :width => img_spec.width, :height => img_spec.height}
      end
    rescue StandardError
      # Deliberately ignored: a malformed or unreadable image must not stop
      # the remaining images from being examined.
    end
  end
  @images
end
|
77
|
+
|
78
|
+
# Parsed HTML document for this page.
#
# The document is built from +body+ with Nokogiri::HTML on first use and
# kept in @nokogiri for later calls.
def nokogiri
  @nokogiri = Nokogiri::HTML(body) if @nokogiri.nil?
  @nokogiri
end
|
82
|
+
|
83
|
+
# Open Graph data declared by the page, cached in @ogp.
#
# Every <meta> element whose "property" attribute starts with "og:" is
# copied into an OpenGraph::Object, using the remainder of the property name
# (dashes turned into underscores) as the key and the "content" attribute as
# the value.
#
# Returns the OpenGraph::Object, or false when the page declares no Open
# Graph properties or the collected object reports itself as not valid.
# The false result is cached too, so the document is scanned only once.
def ogp
  return @ogp unless @ogp.nil?

  graph = OpenGraph::Object.new
  nokogiri.css('meta').each do |m|
    property = m.attribute('property')
    found = property && property.to_s.match(/^og:(.+)$/i)
    graph[found[1].gsub('-', '_')] = m.attribute('content').to_s if found
  end
  # Both checks are made on the collected object; validity must not be asked
  # of the false placeholder, which does not respond to valid?.
  @ogp = (graph.keys.empty? || !graph.valid?) ? false : graph
end
|
95
|
+
|
96
|
+
private
|
97
|
+
|
98
|
+
# Turns an image source found in the page into an absolute URL.
#
# src - the value of an <img> "src" attribute
# url - the address of the page the image was found in
#
# A source that already starts with http:// or https:// is returned
# unchanged. Anything else (root-relative, document-relative or
# protocol-relative) is resolved against +url+ with URI.join, so the scheme
# and port of the page are preserved and document-relative sources are
# resolved against the page's directory rather than glued to its full path.
#
# Returns the absolute URL as a String.
# Raises URI::InvalidURIError when +src+ or +url+ is not a valid URI.
def complete_url(src, url)
  return src if src =~ /\Ahttps?:\/\//i

  URI.join(url, src).to_s
end
|
110
|
+
|
111
|
+
# Decides whether an image with the given dimensions is worth keeping.
#
# w - width of the image (0 when unknown)
# h - height of the image (0 when unknown)
#
# At least one dimension must exceed 199. Beyond that:
# * both known:        the width/height ratio must lie strictly between
#                      0.2 and 3, which rejects banners and thin strips
# * only width known:  the width must be below 700
# * only height known: the height must be below 700
#
# Returns true or false.
def valid_img?(w, h)
  # The height is tested as well as the width, so tall images and images of
  # unknown width are not rejected up front.
  return false unless w > 199 || h > 199

  if w > 0 && h > 0
    ratio = w.to_f / h
    ratio < 3 && ratio > 0.2
  elsif w > 0 && h == 0
    w < 700
  elsif w == 0 && h > 0
    h < 700
  else
    false
  end
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
data/lib/linkser/parser.rb
CHANGED
@@ -5,23 +5,21 @@ module Linkser
|
|
5
5
|
module Parser
|
6
6
|
def self.parse url, options={}
|
7
7
|
if !is_valid_url? url
|
8
|
-
|
8
|
+
raise "Invalid URL"
|
9
9
|
end
|
10
10
|
head = get_head url
|
11
11
|
case head.content_type
|
12
12
|
when "text/html"
|
13
|
-
Linkser::
|
13
|
+
Linkser::Objects::HTML.new url, head
|
14
14
|
else
|
15
15
|
raise "I have no idea on how to parse a '" + head.content_type + "'"
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
-
|
19
|
+
private
|
20
20
|
|
21
21
|
def self.get_head url, limit = 10
|
22
|
-
if (limit==0)
|
23
|
-
raise 'Too many HTTP redirects. URL was not reacheable within the HTTP redirects limit'
|
24
|
-
end
|
22
|
+
raise 'Too many HTTP redirects. URL was not reacheable within the HTTP redirects limit' if (limit==0)
|
25
23
|
uri = URI.parse url
|
26
24
|
http = Net::HTTP.start uri.host, uri.port
|
27
25
|
response = http.head uri.request_uri
|
@@ -33,7 +31,7 @@ module Linkser
|
|
33
31
|
warn "Redirecting to #{location}"
|
34
32
|
return get_head location, limit - 1
|
35
33
|
else
|
36
|
-
raise 'The HTTP
|
34
|
+
raise 'The HTTP request has a ' + response.code + ' code'
|
37
35
|
end
|
38
36
|
end
|
39
37
|
|
@@ -41,10 +39,11 @@ module Linkser
|
|
41
39
|
begin
|
42
40
|
uri = URI.parse(url)
|
43
41
|
if [:scheme, :host].any? { |i| uri.send(i).blank? }
|
44
|
-
raise
|
42
|
+
raise URI::InvalidURIError
|
45
43
|
end
|
46
44
|
return true
|
47
45
|
rescue URI::InvalidURIError => e
|
46
|
+
warn e.to_s
|
48
47
|
return false
|
49
48
|
end
|
50
49
|
end
|
data/lib/linkser/version.rb
CHANGED
data/lib/linkser.rb
CHANGED
data/linkser.gemspec
CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
s.add_runtime_dependency('nokogiri', '~> 1.4.2')
|
25
25
|
s.add_runtime_dependency('rmagick', '~> 2.13.1')
|
26
26
|
s.add_runtime_dependency('ruby-imagespec', "~> 0.2.0")
|
27
|
+
s.add_runtime_dependency('opengraph', "~> 0.0.4")
|
27
28
|
|
28
29
|
# Development Gem dependencies
|
29
30
|
#
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Eduardo Casanova
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-11-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-11-21 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: rake
|
@@ -81,9 +80,25 @@ dependencies:
|
|
81
80
|
type: :runtime
|
82
81
|
version_requirements: *id004
|
83
82
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
83
|
+
name: opengraph
|
85
84
|
prerelease: false
|
86
85
|
requirement: &id005 !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ~>
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
hash: 23
|
91
|
+
segments:
|
92
|
+
- 0
|
93
|
+
- 0
|
94
|
+
- 4
|
95
|
+
version: 0.0.4
|
96
|
+
type: :runtime
|
97
|
+
version_requirements: *id005
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: ruby-debug
|
100
|
+
prerelease: false
|
101
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
87
102
|
none: false
|
88
103
|
requirements:
|
89
104
|
- - ">="
|
@@ -95,11 +110,11 @@ dependencies:
|
|
95
110
|
- 3
|
96
111
|
version: 0.10.3
|
97
112
|
type: :development
|
98
|
-
version_requirements: *
|
113
|
+
version_requirements: *id006
|
99
114
|
- !ruby/object:Gem::Dependency
|
100
115
|
name: rspec
|
101
116
|
prerelease: false
|
102
|
-
requirement: &
|
117
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
103
118
|
none: false
|
104
119
|
requirements:
|
105
120
|
- - ">="
|
@@ -111,7 +126,7 @@ dependencies:
|
|
111
126
|
- 0
|
112
127
|
version: 2.7.0
|
113
128
|
type: :development
|
114
|
-
version_requirements: *
|
129
|
+
version_requirements: *id007
|
115
130
|
description: Linkser is a link parser for Ruby. It gets an URI, tries to dereference it and returns the relevant information about the resource.
|
116
131
|
email:
|
117
132
|
- ecasanovac@gmail.com
|
@@ -130,13 +145,13 @@ files:
|
|
130
145
|
- README.textile
|
131
146
|
- Rakefile
|
132
147
|
- lib/linkser.rb
|
148
|
+
- lib/linkser/object.rb
|
149
|
+
- lib/linkser/objects/html.rb
|
133
150
|
- lib/linkser/parser.rb
|
134
|
-
- lib/linkser/parser/html.rb
|
135
151
|
- lib/linkser/version.rb
|
136
152
|
- linkser.gemspec
|
137
153
|
- spec/linkser_spec.rb
|
138
154
|
- spec/spec_helper.rb
|
139
|
-
has_rdoc: true
|
140
155
|
homepage: https://github.com/ging/linkser
|
141
156
|
licenses: []
|
142
157
|
|
@@ -166,7 +181,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
181
|
requirements: []
|
167
182
|
|
168
183
|
rubyforge_project:
|
169
|
-
rubygems_version: 1.
|
184
|
+
rubygems_version: 1.8.10
|
170
185
|
signing_key:
|
171
186
|
specification_version: 3
|
172
187
|
summary: A link parser for Ruby
|
data/lib/linkser/parser/html.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'net/http'
|
4
|
-
require 'image_spec'
|
5
|
-
|
6
|
-
module Linkser
|
7
|
-
module Parser
|
8
|
-
class HTML
|
9
|
-
def parse url, options={}
|
10
|
-
parsed_page = Hash.new
|
11
|
-
|
12
|
-
doc = Nokogiri::HTML(open(url))
|
13
|
-
|
14
|
-
doc.css('title').each do |title|
|
15
|
-
parsed_page.update({:title => title.text})
|
16
|
-
end
|
17
|
-
|
18
|
-
doc.css('meta').each do |meta|
|
19
|
-
if meta.get_attribute("name").eql? "description"
|
20
|
-
parsed_page.update({:description => meta.get_attribute("content")})
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
images = Array.new
|
25
|
-
|
26
|
-
doc.css('img').each do |img|
|
27
|
-
img_src = img.get_attribute("src")
|
28
|
-
img_src = get_complete_url img_src, url
|
29
|
-
img_uri = URI.parse(img_src)
|
30
|
-
img_ext = File.extname(img_uri.path)
|
31
|
-
img_name = File.basename(img_uri.path,img_ext)
|
32
|
-
if [".jpg", ".jpeg", ".png"].include? img_ext
|
33
|
-
begin
|
34
|
-
img_spec = ImageSpec.new(img_src)
|
35
|
-
w = img_spec.width.to_f
|
36
|
-
h = img_spec.height.to_f
|
37
|
-
if w > 199 or w > 199
|
38
|
-
if ((w > 0 and h > 0 and ((w / h) < 3) and ((w / h) > 0.2)) or (w > 0 and h == 0 and w < 700) or (w == 0 and h > 0 and h < 700)) and img_name.index("logo").nil?
|
39
|
-
image = {:img => img_src, :width => w.to_i, :height => h.to_i}
|
40
|
-
images << image
|
41
|
-
end
|
42
|
-
end
|
43
|
-
rescue
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
if images!=[]
|
49
|
-
parsed_page.update({:images => images})
|
50
|
-
end
|
51
|
-
|
52
|
-
return parsed_page
|
53
|
-
end
|
54
|
-
|
55
|
-
private
|
56
|
-
|
57
|
-
def get_complete_url src, url
|
58
|
-
uri = URI.parse(url)
|
59
|
-
base_url = "http://" + uri.host + (uri.port!=80 ? ":" + uri.port.to_s : "")
|
60
|
-
relative_url = "http://" + uri.host + (uri.port!=80 ? ":" + uri.port.to_s : "") + uri.path
|
61
|
-
if src.index("http://")==0
|
62
|
-
src = src
|
63
|
-
#stays the same
|
64
|
-
elsif src.index("/")==0
|
65
|
-
src = base_url + src
|
66
|
-
else
|
67
|
-
src = relative_url + src
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|