links_processor 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +1 -0
- data/lib/links_processor.rb +190 -0
- data/lib/links_processor/version.rb +3 -0
- data/links_processor.gemspec +29 -0
- metadata +95 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,190 @@
|
|
1
|
+
# To change this template, choose Tools | Templates
|
2
|
+
# and open the template in the editor.
|
3
|
+
require 'net/http'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'oembed'
|
7
|
+
require 'image_size'
|
8
|
+
|
9
|
+
module LinksProcessor
|
10
|
+
class LinksProcessor
|
11
|
+
|
12
|
+
# Builds a processor for one kind of link.
#
# type   - suffix of the process_* method that #process dispatches to.
# source - optional oEmbed provider name (String or Symbol, any case), e.g.
#          'youtube'. It is capitalized and looked up as a constant under
#          OEmbed::Providers; when no such constant exists @processor stays nil.
def initialize(type, source = nil)
  @type = type
  # Normalize to a String so the later @source == 'Youtube' style checks
  # also work when a Symbol was passed in.
  source &&= source.to_s.capitalize
  # Module#constants returns Symbols on Ruby 1.9+ (Strings on 1.8). Compare
  # the names as Strings, otherwise a String source never matches on 1.9+.
  known = source && OEmbed::Providers.constants.map { |name| name.to_s }.include?(source)
  @processor = OEmbed::Providers.const_get(source) if known
  @source = source
end
|
18
|
+
|
19
|
+
# Dispatches to the process_<type> method chosen by the @type given to
# #initialize.
#
# args - forwarded unchanged to the handler. The process_video, process_doc
#        and process_link handlers all require at least a url, so a dispatch
#        without arguments can never reach them successfully.
#
# Returns whatever the handler returns.
# Raises NoMethodError when no process_<type> method exists.
def process(*args)
  send(:"process_#{@type}", *args)
end
|
22
|
+
|
23
|
+
# Fetches oEmbed data for a video URL and annotates the response.
#
# url           - the video URL handed to the provider in @processor.
# video_options - Hash of extra request options. It is copied rather than
#                 modified, and :format => :json is always applied.
#
# The response gets #uri (its parsed request_url) and #sld (via get_sld).
# When @source is 'Youtube' the embed HTML is rewritten so that the player
# URL carries '&autoplay=1'.
#
# Returns the response object produced by @processor.get.
def process_video(url, video_options = {})
  # merge (not merge!) so the caller's Hash is left untouched.
  request_options = (video_options || {}).merge(:format => :json)
  processed_video = @processor.get(url, request_options)
  processed_video.uri = URI.parse(processed_video.request_url)
  processed_video.sld = get_sld(processed_video.uri.host)

  if @source == 'Youtube'
    doc = Nokogiri::HTML.fragment(processed_video.html)
    # Appends the autoplay flag; a missing node or attribute is left alone
    # instead of raising NoMethodError on nil.
    add_autoplay = lambda do |node, attribute|
      next unless node && node[attribute]
      node[attribute] = node[attribute] + '&autoplay=1'
    end

    movie_param = doc.at_xpath('.//param[@name=\'movie\']')
    if movie_param
      # Object-style embed: patch both the <param> and the <embed> fallback.
      add_autoplay.call(movie_param, 'value')
      add_autoplay.call(doc.at_xpath('.//embed'), 'src')
    else
      add_autoplay.call(doc.at_xpath('.//iframe'), 'src')
    end
    processed_video.fields['html'] = doc.to_html
  end

  processed_video
end
|
40
|
+
|
41
|
+
# Fetches oEmbed data for a document URL and annotates the response.
#
# url         - the document URL handed to the provider in @processor.
# doc_options - Hash of extra request options. It is copied rather than
#               modified, and :format => :json is always applied.
#
# The response gets #uri (its parsed request_url) and #sld (via get_sld).
# For the 'Scribd' source the height attribute of the embedded <object> is
# copied to #height. For 'Slideshare' the HTML is reduced to its <iframe>.
#
# Returns the response object produced by @processor.get.
def process_doc(url, doc_options = {})
  # merge (not merge!) so the caller's Hash is left untouched.
  request_options = (doc_options || {}).merge(:format => :json)
  processed_doc = @processor.get(url, request_options)
  processed_doc.uri = URI.parse(processed_doc.request_url)
  processed_doc.sld = get_sld(processed_doc.uri.host)

  # The embed HTML is parsed only in the branches that need it.
  if @source == 'Scribd'
    embed = Nokogiri::HTML.fragment(processed_doc.html).at_xpath('.//object')
    processed_doc.height = embed['height'] if embed
  elsif @source == 'Slideshare'
    iframe = Nokogiri::HTML.fragment(processed_doc.html).at_xpath('.//iframe')
    # Keep the provider's HTML when no <iframe> is present instead of
    # replacing it with an empty string.
    processed_doc.fields['html'] = iframe.to_s if iframe
  end

  processed_doc
end
|
54
|
+
|
55
|
+
# Image byte-size bounds written as human-readable strings.
# NOTE(review): neither size constant is referenced by the code visible in
# this file; confirm before relying on them.
MAX_IMAGESIZE = '512K'.freeze
MIN_IMAGESIZE = '1K'.freeze
# Bound used by process_link when collecting image URLs.
MAX_IMAGES = 10
# Width/height ratios that process_link accepts for a candidate image.
ASPECT_RATIO_RANGE = (0.25..4).freeze
|
59
|
+
|
60
|
+
# Scrapes a page for link-preview data.
#
# url          - absolute URL of the page to fetch (a scheme open-uri supports).
# link_options - accepted but not used by this method.
#
# Returns a Hash with :title, :description, :images (Array of URL Strings),
# :sld (host reduced by get_sld) and :uri (the parsed URI).
# Raises ArgumentError when the URL's scheme cannot be fetched.
def process_link(url, link_options = nil)
  uri = URI.parse(url)
  # Fetch through URI#open: Kernel#open stopped handling URLs in Ruby 3.0 and
  # must never be handed arbitrary strings.
  raise ArgumentError, "cannot fetch #{url.inspect}" unless uri.respond_to?(:open)

  uri_sld = get_sld(uri.host)
  doc = uri.open { |page| Nokogiri::HTML(page) }

  # Content attribute of the first node matching xpath, or nil.
  meta_content = lambda do |xpath|
    node = doc.at_xpath(xpath)
    node && node['content']
  end

  data = { :images => [], :title => nil, :description => nil }
  data[:title] = meta_content.call('/html/head/meta[@property=\'og:title\']') ||
                 meta_content.call('/html/head/meta[@name=\'title\']') ||
                 doc.xpath('/html/head/title').text
  data[:description] = meta_content.call('/html/head/meta[@name=\'description\']')
  data[:sld] = uri_sld
  data[:uri] = uri

  # An explicit og:image wins; the <img> tags are not scanned in that case.
  og_image = meta_content.call('/html/head/meta[@property=\'og:image\']')
  if og_image
    data[:images] << og_image
    return data
  end

  # Prefer <img> tags that declare numeric width and height attributes.
  unsized_images = []
  images = doc.xpath('/html/body//img').select do |img|
    width, height = %w[width height].map { |attr| img[attr] && img[attr].chomp('px') }
    valid_size = width && height && width =~ /\d+/ && height =~ /\d+/
    unsized_images << img unless valid_size
    valid_size
  end
  # Only when no tag declares a size are the images measured.
  images += update_images_sizes(unsized_images, uri) if images.empty?

  images.each do |img|
    break if data[:images].size >= MAX_IMAGES

    src = img['src'].to_s.strip
    next if src.empty?

    # Skip likely ad servers. The word boundary keeps hosts such as
    # "upload." or "download." from being mistaken for "ad.".
    next if src =~ /\bads?\./i

    # Resolve relative and protocol-relative sources against the page URL.
    img_uri = begin
      uri.merge(src)
    rescue URI::Error
      nil
    end
    next unless img_uri && img_uri.host

    width = img['width'].to_i
    height = img['height'].to_i

    # Skip small images and extreme aspect ratios. Float division is needed:
    # integer division turns every portrait ratio into 0.
    next if width < 50 || height < 50
    next unless ASPECT_RATIO_RANGE.include?(width.to_f / height)

    # Keep only images on the page's own second-level domain, e.g.
    # static.example.com for a page on www.example.com.
    next unless get_sld(img_uri.host) == uri_sld

    data[:images] << img_uri.to_s
  end

  data
end
|
122
|
+
|
123
|
+
# Fills in width/height for image nodes that do not declare them.
#
# images     - enumerable of nodes that support [] and []= for the 'src',
#              'width' and 'height' keys.
# parent_uri - URI of the page, used to resolve relative image sources.
#
# The size is taken from a "<width>x<height>" suffix of the file name when
# present. Otherwise the image is downloaded and measured with ImageSize.
# Images whose source is missing, unparsable, not fetchable or not
# measurable are skipped.
#
# Returns an Array of at most five nodes that now carry both attributes.
def update_images_sizes(images, parent_uri)
  sized_images = []
  images.each do |img|
    break if sized_images.size > 4

    src = img['src'].to_s.strip
    next if src.empty?

    img_uri = begin
      URI.parse(src)
    rescue URI::InvalidURIError
      nil
    end
    next unless img_uri

    # Opaque URIs (data:, mailto:) have no path, hence the to_s.
    match = /(\d{1,4})(x|X)(\d{1,4})$/.match(File.basename(img_uri.path.to_s, '.*'))
    if match
      img['width'] = match[1]
      img['height'] = match[3]
    else
      begin
        target = parent_uri.merge(img_uri)
        # Only URIs that open-uri can fetch respond to #open.
        next unless target.respond_to?(:open)
        target.open('rb') do |fh|
          img_size = ImageSize.new(fh.read)
          img['width'] = img_size.get_width.to_s unless img_size.get_width.nil?
          img['height'] = img_size.get_height.to_s unless img_size.get_height.nil?
        end
      rescue URI::Error, OpenURI::HTTPError, SocketError, SystemCallError, IOError, Timeout::Error
        # Measuring is best effort: one unreachable image must not abort
        # the whole scan.
        next
      end
    end

    sized_images << img if img['width'] && img['height']
  end
  sized_images
end
|
146
|
+
|
147
|
+
# Debug probe: sends a HEAD request for a fixed sample image on
# www.biostat.wisc.edu and prints the Content-Length header of the response.
def validate_size
  head_response = Net::HTTP.start('www.biostat.wisc.edu', 80) do |http|
    http.head('/bcg/categories/languages/ruby/ruby_logo.png')
  end
  p head_response['content-length']
end
|
154
|
+
|
155
|
+
private
|
156
|
+
|
157
|
+
# Placeholder with an empty body; it takes no arguments and returns nil.
def open_url; end
|
160
|
+
|
161
|
+
# Reduces a host name to its last two dot-separated labels.
#
# domain - host name String, or nil (URI#host is nil for host-less URIs).
#
# Examples
#   get_sld('static.example.com') # => "example.com"
#   get_sld('example.com')        # => "example.com"
#   get_sld(nil)                  # => nil
#
# Multi-label public suffixes are not recognized: 'www.example.co.uk'
# yields 'co.uk'.
#
# Returns the shortened host, the input itself when it has at most two
# labels, or nil for a nil input.
def get_sld(domain)
  return domain if domain.nil?

  labels = domain.split('.')
  labels.size > 2 ? labels.last(2).join('.') : domain
end
|
165
|
+
end
|
166
|
+
|
167
|
+
# Monkey Patches for OEMBED && OPEN-URI
|
168
|
+
# Reopens OEmbed::Formatter::JSON to replace its module-level test_value.
# The method returns the JSON sample document in the heredoc below.
# NOTE(review): presumably the oembed gem uses this sample to probe its JSON
# backends; confirm against the installed oembed version.
module OEmbed::Formatter::JSON
|
169
|
+
# NOTE(review): a bare `private` applies to instance methods defined after
# it, not to `def self.` methods, so test_value stays publicly callable.
private
|
170
|
+
def self.test_value
|
171
|
+
<<-JSON
|
172
|
+
{"version":1.0, "string":"test", "int":42,"html":"<i>Cool's</i>\\n the \\"word\\"\\u0021"}
|
173
|
+
JSON
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
# Reopens OEmbed::Response to add the extra attributes that process_video
# and process_doc assign to a response.
class OEmbed::Response
  # Parsed request URL.
  attr_accessor :uri
  # Host of that URL reduced by get_sld.
  attr_accessor :sld
  # Height attribute copied from the embed markup by process_doc.
  attr_accessor :height
end
|
180
|
+
|
181
|
+
# Monkey patch OPENURI to follow redirect URLS
|
182
|
+
def OpenURI.redirectable?(uri1, uri2) # :nodoc:
  # Guards against a redirect from http://... to something like
  # file:///etc/passwd. The rule is ad hoc rather than configurable:
  # a redirect is followed when both schemes are equal, or when it goes
  # from http/ftp to http, https or ftp.
  from_scheme = uri1.scheme
  to_scheme = uri2.scheme
  return true if from_scheme.downcase == to_scheme.downcase

  /\A(?:http|ftp)\z/i =~ from_scheme && /\A(?:https?|ftp)\z/i =~ to_scheme
end
|
189
|
+
|
190
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "links_processor/version"
|
4
|
+
|
5
|
+
# Packaging metadata for the links_processor gem.
Gem::Specification.new do |spec|
  spec.name        = "links_processor"
  spec.version     = LinksProcessor::VERSION
  spec.authors     = ["Shadab Ahmed"]
  spec.email       = ["shadab.ansari@gmail.com"]
  spec.homepage    = ""
  spec.summary     = "Processes any link and gets meta info for it"
  spec.description = "Fetches meta and embed information for any link."

  spec.rubyforge_project = "links_processor"

  # The packaged file lists come from what git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies, all without a version constraint.
  %w[nokogiri open-uri oembed image_size].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end

  # specify any dependencies here; for example:
  # s.add_development_dependency "rspec"
  # s.add_runtime_dependency "rest-client"
end
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: links_processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Shadab Ahmed
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-03-16 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &15446840 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *15446840
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: open-uri
|
27
|
+
requirement: &15441940 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *15441940
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: oembed
|
38
|
+
requirement: &15441400 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *15441400
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: image_size
|
49
|
+
requirement: &15440980 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *15440980
|
58
|
+
description: Fetches meta and embed information for any link.
|
59
|
+
email:
|
60
|
+
- shadab.ansari@gmail.com
|
61
|
+
executables: []
|
62
|
+
extensions: []
|
63
|
+
extra_rdoc_files: []
|
64
|
+
files:
|
65
|
+
- .gitignore
|
66
|
+
- Gemfile
|
67
|
+
- Rakefile
|
68
|
+
- lib/links_processor.rb
|
69
|
+
- lib/links_processor/version.rb
|
70
|
+
- links_processor.gemspec
|
71
|
+
homepage: ''
|
72
|
+
licenses: []
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options: []
|
75
|
+
require_paths:
|
76
|
+
- lib
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
requirements: []
|
90
|
+
rubyforge_project: links_processor
|
91
|
+
rubygems_version: 1.8.10
|
92
|
+
signing_key:
|
93
|
+
specification_version: 3
|
94
|
+
summary: Processes any link and gets meta info for it
|
95
|
+
test_files: []
|