links_processor 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in links_processor.gemspec
gemspec
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,190 @@
1
+ # To change this template, choose Tools | Templates
2
+ # and open the template in the editor.
3
+ require 'net/http'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'oembed'
7
+ require 'image_size'
8
+
9
+ module LinksProcessor
10
+ class LinksProcessor
11
+
12
# Builds a processor for one kind of link.
#
# type   - name of the link kind; #process dispatches to the method called
#          "process_<type>".
# source - optional provider name as a String or Symbol (e.g. "youtube").
#          It is capitalized and, when OEmbed::Providers defines a constant
#          with that name, the constant is kept as the oEmbed provider.
def initialize(type, source = nil)
  @type = type
  source &&= source.to_s.capitalize
  # Module#constants returns Symbols on Ruby >= 1.9 (Strings on 1.8), so the
  # names are compared as Strings. Comparing the String against the raw list
  # never matched on 1.9+ and left @processor unset.
  if source && OEmbed::Providers.constants.any? { |name| name.to_s == source }
    @processor = OEmbed::Providers.const_get(source)
  end
  @source = source
end
18
+
19
# Runs the handler for this processor's type by calling "process_<type>".
#
# args - forwarded unchanged to the handler. The handlers defined in this
#        class all require at least a URL, so dispatching without arguments
#        (as this method used to) always raised ArgumentError.
#
# Returns whatever the handler returns. Raises NoMethodError when no
# "process_<type>" method exists.
def process(*args)
  send(:"process_#{@type}", *args)
end
22
+
23
# Fetches oEmbed data for a video URL through the configured provider and,
# for YouTube, switches on autoplay in the returned embed markup.
#
# url           - video URL handed to the oEmbed provider.
# video_options - Hash of extra request options. :format is forced to :json
#                 on a copy, so the caller's hash is not modified.
#
# Returns the provider response with #uri (parsed request URL) and #sld set.
def process_video(url, video_options = {})
  request_options = (video_options || {}).merge(:format => :json)
  processed_video = @processor.get(url, request_options)
  processed_video.uri = URI.parse processed_video.request_url
  processed_video.sld = get_sld processed_video.uri.host
  if @source == 'Youtube'
    doc = Nokogiri::HTML::fragment processed_video.html
    # Markup with <param name="movie"> also carries an <embed>; otherwise an
    # <iframe> is expected.
    movie_param = doc.at_xpath('.//param[@name=\'movie\']')
    autoplay_targets =
      if movie_param
        [[movie_param, 'value'], [doc.at_xpath('.//embed'), 'src']]
      else
        [[doc.at_xpath('.//iframe'), 'src']]
      end
    autoplay_targets.each do |node, attribute|
      # A missing node or attribute is skipped rather than raising
      # NoMethodError on nil.
      next unless node && node[attribute]
      node[attribute] += '&autoplay=1'
    end
    processed_video.fields['html'] = doc.to_html
  end
  processed_video
end
40
+
41
# Fetches oEmbed data for a document URL through the configured provider and
# applies the Scribd / Slideshare specific adjustments.
#
# url         - document URL handed to the oEmbed provider.
# doc_options - Hash of extra request options. :format is forced to :json on
#               a copy, so the caller's hash is not modified.
#
# Returns the provider response with #uri (parsed request URL) and #sld set.
# For Scribd, #height is taken from the embed's <object> element; for
# Slideshare, the html field is reduced to the <iframe> element. When the
# expected element is absent the response is left as the provider returned it.
def process_doc(url, doc_options = {})
  request_options = (doc_options || {}).merge(:format => :json)
  processed_doc = @processor.get(url, request_options)
  processed_doc.uri = URI.parse processed_doc.request_url
  processed_doc.sld = get_sld processed_doc.uri.host
  doc = Nokogiri::HTML::fragment processed_doc.html
  case @source
  when 'Scribd'
    embed_object = doc.at_xpath('.//object')
    processed_doc.height = embed_object['height'] if embed_object
  when 'Slideshare'
    frame = doc.at_xpath('.//iframe')
    # Without this guard a missing iframe replaced the html with "".
    processed_doc.fields['html'] = frame.to_s if frame
  end
  processed_doc
end
54
+
55
# Presumably byte-size limits for images; nothing in the visible code reads
# them -- confirm before relying on them. Frozen so the shared Strings cannot
# be mutated in place.
MAX_IMAGESIZE = '512K'.freeze
MIN_IMAGESIZE = '1K'.freeze
# Cap on the number of image URLs process_link collects.
MAX_IMAGES = 10
# Width/height ratios process_link accepts for a candidate image.
ASPECT_RATIO_RANGE = 0.25..4
59
+
60
# Scrapes preview metadata (title, description, candidate images) from the
# page at +url+.
#
# url          - absolute URL of the page; it must be a scheme OpenURI can
#                fetch (http, https or ftp) and must have a host.
# link_options - accepted for interface compatibility; not used.
#
# Returns a Hash with the keys :images (Array of image URL Strings), :title,
# :description, :sld (second-level domain of the page) and :uri (parsed URI).
# When the page declares an og:image, that value is the only image returned;
# otherwise up to MAX_IMAGES <img> elements from the body are selected.
#
# Raises ArgumentError when +url+ cannot be fetched by OpenURI; URI parse
# errors and OpenURI/network errors propagate to the caller.
def process_link(url, link_options = nil)
  uri = URI.parse url
  # URI#open is mixed in by open-uri for fetchable schemes. It is used
  # instead of Kernel#open, which no longer accepts URLs on Ruby >= 3.0.
  unless uri.respond_to?(:open) && uri.host
    raise ArgumentError, "cannot fetch #{url.inspect}"
  end
  uri_sld = get_sld uri.host
  doc = Nokogiri::HTML(uri.open)

  data = {
    :images => [],
    :title => link_title(doc),
    :description => link_meta_content(doc, '/html/head/meta[@name=\'description\']'),
    :sld => uri_sld,
    :uri => uri
  }

  og_image = link_meta_content(doc, '/html/head/meta[@property=\'og:image\']')
  if og_image
    data[:images] << og_image
  else
    data[:images].concat(link_body_images(doc, uri, uri_sld))
  end
  data
end

# Returns the content attribute of the first node matching +xpath+, or nil
# when there is no such node or attribute.
def link_meta_content(doc, xpath)
  node = doc.at_xpath(xpath)
  node && node['content']
end

# Picks the page title: og:title first, then <meta name="title">, then the
# text of <title> (an empty String when the page has none).
def link_title(doc)
  link_meta_content(doc, '/html/head/meta[@property=\'og:title\']') ||
    link_meta_content(doc, '/html/head/meta[@name=\'title\']') ||
    doc.xpath('/html/head/title').text
end

# Collects at most MAX_IMAGES acceptable image URLs from the page body.
def link_body_images(doc, page_uri, page_sld)
  sized, unsized = doc.xpath('/html/body//img').partition { |img| link_image_sized?(img) }
  # Images are only measured (possibly by downloading them) when the markup
  # declared no usable dimensions at all.
  sized = update_images_sizes(unsized, page_uri) if sized.empty?

  urls = []
  sized.each do |img|
    break if urls.size >= MAX_IMAGES
    image_url = link_image_url(img, page_uri, page_sld)
    urls << image_url if image_url
  end
  urls
end

# True when both width and height attributes are plain pixel values
# ("120" or "120px"); percentages and other units count as unsized.
def link_image_sized?(img)
  %w[width height].all? { |name| img[name].to_s =~ /\A\d+(px)?\z/ }
end

# Returns the absolute URL of +img+ when it qualifies as a preview image,
# otherwise nil.
def link_image_url(img, page_uri, page_sld)
  src = img['src'].to_s.strip
  return nil if src.empty?

  # Skip likely ad images. The word boundary keeps names such as "upload."
  # or "head." from matching.
  return nil if src =~ /\bads?\./i

  width, height = img['width'].to_i, img['height'].to_i
  return nil if width < 50 || height < 50
  # Float division: integer division truncated every ratio below 1 to 0.
  return nil unless ASPECT_RATIO_RANGE.include?(width.to_f / height)

  # Resolve relative and scheme-less srcs against the page URL.
  img_uri = begin
    page_uri.merge(src)
  rescue URI::Error
    nil
  end
  return nil unless img_uri && img_uri.host

  # Keep only images served from the page's own second-level domain, e.g.
  # static.example.com and www.example.com both have the sld example.com.
  return nil unless get_sld(img_uri.host) == page_sld

  img_uri.to_s
end

private :link_meta_content, :link_title, :link_body_images, :link_image_sized?, :link_image_url
122
+
123
# Upper bound on the number of images update_images_sizes returns.
MAX_SIZED_IMAGES = 5

# Works out dimensions for images whose markup did not declare any.
#
# images     - enumerable of image elements; each must support img['src'],
#              img['width'] and img['height'] reads and writes. The elements
#              are updated in place.
# parent_uri - URI of the page, used to resolve relative image srcs.
#
# The size is read from a "<width>x<height>" suffix of the file name when
# present (e.g. banner_300x250.png); otherwise the image is downloaded and
# measured with ImageSize. Images with an empty or unparsable src, with a
# non-fetchable scheme (e.g. data:), or whose download fails are skipped.
#
# Returns an Array of at most MAX_SIZED_IMAGES elements that ended up with
# both width and height.
def update_images_sizes(images, parent_uri)
  sized_images = []
  images.each do |img|
    break if sized_images.size >= MAX_SIZED_IMAGES

    src = img['src'].to_s
    next if src.empty?
    img_uri = begin
      parent_uri.merge(src)
    rescue URI::Error
      nil
    end
    # Opaque URIs such as data: or mailto: may have no path.
    next unless img_uri && img_uri.path

    # Try the file name first (e.g. 100x35); a request is made only when
    # the name carries no size.
    if match = /(\d{1,4})x(\d{1,4})\z/i.match(File.basename(img_uri.path, '.*'))
      img['width'] = match[1]
      img['height'] = match[2]
    elsif img_uri.respond_to?(:open)
      begin
        img_uri.open('rb') do |fh|
          img_size = ImageSize.new(fh.read)
          img['width'] = img_size.get_width.to_s unless img_size.get_width.nil?
          img['height'] = img_size.get_height.to_s unless img_size.get_height.nil?
        end
      rescue StandardError
        # One missing or unreadable image must not abort the whole page.
        next
      end
    end
    sized_images << img if img['width'] && img['height']
  end
  sized_images
end
146
+
147
# Debug helper: sends a HEAD request for a fixed sample image on
# www.biostat.wisc.edu and prints the Content-Length header of the reply.
# Returns that header value (Kernel#p returns its argument).
def validate_size
  head_response = Net::HTTP.start('www.biostat.wisc.edu', 80) do |connection|
    connection.head('/bcg/categories/languages/ruby/ruby_logo.png')
  end
  p head_response['content-length']
end
154
+
155
+ private
156
+
157
# Placeholder with no implementation; always returns nil.
def open_url; end
160
+
161
# Reduces a host name to its last two dot-separated labels, lower-cased,
# e.g. "static.Example.com" -> "example.com".
#
# domain - host name String, or nil (e.g. URI#host of a relative URI).
#
# Returns the lower-cased last two labels when the host has more than two,
# the lower-cased host itself otherwise, and nil for a nil host. Host names
# are case-insensitive, so lower-casing keeps comparisons between two
# results reliable.
#
# NOTE: this is a plain "last two labels" rule; multi-part public suffixes
# such as "co.uk" are not recognised.
def get_sld(domain)
  return nil if domain.nil?

  host = domain.to_s.downcase
  labels = host.split('.')
  labels.size > 2 ? labels[-2..-1].join('.') : host
end
165
+ end
166
+
167
+ # Monkey Patches for OEMBED && OPEN-URI
168
module OEmbed::Formatter::JSON
  class << self
    # Overrides the sample JSON document exposed by the oembed gem's JSON
    # formatter. NOTE(review): presumably the gem uses it to probe JSON
    # decoders -- confirm against the installed oembed version.
    def test_value
      <<-JSON
{"version":1.0, "string":"test", "int":42,"html":"<i>Cool's</i>\\n the \\"word\\"\\u0021"}
      JSON
    end
  end
end
176
+
177
# Extra attributes LinksProcessor attaches to oEmbed responses.
class OEmbed::Response
  # Parsed URI of the response's request URL (set by process_video / process_doc).
  attr_accessor :uri
  # Second-level domain derived from #uri's host.
  attr_accessor :sld
  # Embed height; process_doc fills it in for Scribd documents.
  attr_accessor :height
end
180
+
181
+ # Monkey patch OPENURI to follow redirect URLS
182
# Decides whether OpenURI may follow a redirect from +uri1+ to +uri2+.
# Redirects within the same scheme are always allowed; in addition an http
# or ftp URL may redirect to http, https or ftp. The restriction exists to
# forbid redirects such as http://... -> file:///etc/passwd.
def OpenURI.redirectable?(uri1, uri2) # :nodoc:
  source_scheme = uri1.scheme
  target_scheme = uri2.scheme
  return true if source_scheme.downcase == target_scheme.downcase

  /\A(?:http|ftp)\z/i =~ source_scheme && /\A(?:https?|ftp)\z/i =~ target_scheme
end
189
+
190
+ end
@@ -0,0 +1,3 @@
1
module LinksProcessor
  # Gem version; frozen so the shared String cannot be mutated in place.
  VERSION = "0.5.1".freeze
end
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
# Make lib/ loadable so the version constant can be read below.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "links_processor/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "links_processor"
  spec.version     = LinksProcessor::VERSION
  spec.authors     = ["Shadab Ahmed"]
  spec.email       = ["shadab.ansari@gmail.com"]
  spec.homepage    = ""
  spec.summary     = %q{Processes any link and gets meta info for it}
  spec.description = %q{Fetches meta and embed information for any link.}

  spec.rubyforge_project = "links_processor"

  # Packaged files are whatever git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies, registered in this order.
  ["nokogiri", "open-uri", "oembed", "image_size"].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end

  # specify any dependencies here; for example:
  # spec.add_development_dependency "rspec"
  # spec.add_runtime_dependency "rest-client"
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: links_processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Shadab Ahmed
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-16 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &15446840 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *15446840
25
+ - !ruby/object:Gem::Dependency
26
+ name: open-uri
27
+ requirement: &15441940 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *15441940
36
+ - !ruby/object:Gem::Dependency
37
+ name: oembed
38
+ requirement: &15441400 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *15441400
47
+ - !ruby/object:Gem::Dependency
48
+ name: image_size
49
+ requirement: &15440980 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *15440980
58
+ description: Fetches meta and embed information for any link.
59
+ email:
60
+ - shadab.ansari@gmail.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - .gitignore
66
+ - Gemfile
67
+ - Rakefile
68
+ - lib/links_processor.rb
69
+ - lib/links_processor/version.rb
70
+ - links_processor.gemspec
71
+ homepage: ''
72
+ licenses: []
73
+ post_install_message:
74
+ rdoc_options: []
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project: links_processor
91
+ rubygems_version: 1.8.10
92
+ signing_key:
93
+ specification_version: 3
94
+ summary: Processes any link and gets meta info for it
95
+ test_files: []