links_processor 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so packages cannot be altered in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in links_processor.gemspec
gemspec
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,190 @@
1
+ # To change this template, choose Tools | Templates
2
+ # and open the template in the editor.
3
+ require 'net/http'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'oembed'
7
+ require 'image_size'
8
+
9
+ module LinksProcessor
10
+ class LinksProcessor
11
+
12
# Builds a processor for one kind of link.
#
# type   - name of the link kind; #process dispatches to "process_<type>".
# source - optional oEmbed provider name, String or Symbol in any letter case
#          (e.g. "youtube", :scribd). It is stored capitalised, as a String,
#          because the process_* methods compare it with 'Youtube', 'Scribd'
#          and 'Slideshare'.
#
# @processor is set only when OEmbed::Providers defines a constant with the
# capitalised name.
def initialize(type, source = nil)
  @type = type
  source &&= source.to_s.capitalize
  if source
    # Module#constants yields Symbols on Ruby 1.9+ and Strings on 1.8, so the
    # names are compared as Strings to work on both.
    provider = OEmbed::Providers.constants.find { |const| const.to_s == source }
    @processor = OEmbed::Providers.const_get(provider) if provider
  end
  @source = source
end
18
+
19
# Dispatches to the process_<type> method selected by the type given to the
# constructor, forwarding every argument (and block) unchanged.
#
# All process_* methods in this class require at least a url, so a call
# without arguments can only succeed for a handler that takes none.
#
# Raises NoMethodError when no process_<type> method exists.
def process(*args, &block)
  send(:"process_#{@type}", *args, &block)
end
22
+
23
# Fetches oEmbed data for a video URL through the provider chosen in the
# constructor and decorates the response.
#
# url           - video page URL passed to the provider.
# video_options - Hash of extra oEmbed request options (may be nil). It is
#                 copied, so the caller's Hash is not modified; :format is
#                 always forced to :json.
#
# Returns the provider's response with #uri and #sld filled in. For the
# 'Youtube' source the embed markup is rewritten so the player autoplays.
def process_video(url, video_options)
  options = (video_options || {}).merge(:format => :json)
  processed_video = @processor.get(url, options)
  processed_video.uri = URI.parse processed_video.request_url
  processed_video.sld = get_sld processed_video.uri.host
  return processed_video unless @source == 'Youtube'

  doc = Nokogiri::HTML::fragment processed_video.html

  # Appends autoplay=1 to the player URL held in node[attribute]. A missing
  # node or attribute is ignored. '?' is used only when the URL carries no
  # parameters at all.
  autoplay = lambda do |node, attribute|
    player_url = node && node[attribute]
    if player_url
      separator = player_url =~ /[?&]/ ? '&' : '?'
      node[attribute] = "#{player_url}#{separator}autoplay=1"
    end
  end

  if (movie = doc.at_xpath(".//param[@name='movie']"))
    # <object> markup: the URL appears in both <param name="movie"> and <embed>.
    autoplay.call(movie, 'value')
    autoplay.call(doc.at_xpath('.//embed'), 'src')
  elsif (iframe = doc.at_xpath('.//iframe'))
    autoplay.call(iframe, 'src')
  end
  processed_video.fields['html'] = doc.to_html
  processed_video
end
40
+
41
# Fetches oEmbed data for a document URL through the provider chosen in the
# constructor and decorates the response.
#
# url         - document page URL passed to the provider.
# doc_options - Hash of extra oEmbed request options (may be nil). It is
#               copied, so the caller's Hash is not modified; :format is
#               always forced to :json.
#
# Returns the provider's response with #uri and #sld filled in. For 'Scribd'
# the height of the <object> tag is copied to #height; for 'Slideshare' the
# html field is reduced to the <iframe> element. Either step is skipped when
# the expected element is absent from the markup.
def process_doc(url, doc_options)
  options = (doc_options || {}).merge(:format => :json)
  processed_doc = @processor.get(url, options)
  processed_doc.uri = URI.parse processed_doc.request_url
  processed_doc.sld = get_sld processed_doc.uri.host
  doc = Nokogiri::HTML::fragment processed_doc.html

  case @source
  when 'Scribd'
    embed = doc.at_xpath('.//object')
    processed_doc.height = embed['height'] if embed
  when 'Slideshare'
    iframe = doc.at_xpath('.//iframe')
    # Keep the provider's html rather than blanking it when there is no iframe.
    processed_doc.fields['html'] = iframe.to_s if iframe
  end
  processed_doc
end
54
+
55
# Image size bounds. Neither is referenced by the code visible in this file.
MAX_IMAGESIZE = '512K'.freeze
MIN_IMAGESIZE = '1K'.freeze
# Limit on the number of image URLs process_link collects.
MAX_IMAGES = 10
# Accepted width/height ratios for candidate images.
ASPECT_RATIO_RANGE = (0.25..4).freeze
59
+
60
# Scrapes the page at +url+ for link-preview metadata.
#
# The title comes from og:title, then <meta name="title">, then <title>; the
# description from <meta name="description">. When the page declares an
# og:image it is the only image returned. Otherwise the <img> tags in the body
# are filtered by declared size, aspect ratio and domain.
#
# url          - page URL (String); the page is downloaded with open-uri.
# link_options - not used by this method.
#
# Returns a Hash with the keys :title, :description, :images (Array of URL
# Strings), :sld and :uri (the parsed URI).
def process_link(url, link_options = nil)
  uri = URI.parse url
  uri_sld = get_sld uri.host
  data = {:images => [], :title => nil, :description => nil}

  # URI#open is only defined for the schemes open-uri supports, so - unlike
  # Kernel#open - a crafted url can never open a local file or a "|command"
  # pipe. The block form also closes the handle once parsing is done.
  doc = uri.open { |page| Nokogiri::HTML(page) }

  # content attribute of the first node matching xpath, or nil.
  meta_content = lambda do |xpath|
    node = doc.at_xpath(xpath)
    node && node['content']
  end

  data[:title] = meta_content.call("/html/head/meta[@property='og:title']") ||
                 meta_content.call("/html/head/meta[@name='title']") ||
                 doc.xpath('/html/head/title').text
  data[:description] = meta_content.call("/html/head/meta[@name='description']")
  data[:sld] = uri_sld
  data[:uri] = uri

  og_image = meta_content.call("/html/head/meta[@property='og:image']")
  if og_image
    data[:images] << og_image
    return data
  end

  # Separate the tags whose width and height attributes both contain a number.
  sized, unsized = doc.xpath('/html/body//img').partition do |img|
    width, height = %w[width height].map { |attr| img[attr] && img[attr].chomp('px') }
    width && height && width =~ /\d+/ && height =~ /\d+/
  end

  # Image sizes are only probed when no tag declares them.
  sized += update_images_sizes(unsized, uri) if sized.empty?

  sized.each do |img|
    break if data[:images].size >= MAX_IMAGES

    src = img['src']
    next unless src
    begin
      img_uri = URI.parse(src)
    rescue URI::InvalidURIError
      next
    end

    width, height = img['width'].to_i, img['height'].to_i

    # Heuristic ad filter; NOTE(review): it also matches names such as
    # "upload." or "thread." - tighten if false positives matter.
    next if img_uri.to_s =~ /(ad|ads)\./i

    # Skip tiny images (also covers unparsable sizes, which become 0).
    next if width < 50 || height < 50
    # Float division: integer division would turn 100/300 into 0.
    next unless ASPECT_RATIO_RANGE.include?(width.to_f / height)

    # Resolve relative and scheme-relative references against the page URL.
    img_uri = uri.merge(img_uri) if img_uri.relative?
    # Host-less absolute URIs (e.g. data:) cannot be matched to a domain.
    next unless img_uri.host

    # Keep only images from the page's own domain, e.g. static.example.com
    # and www.example.com both reduce to example.com.
    next unless get_sld(img_uri.host) == uri_sld

    data[:images] << img_uri.to_s
  end
  data
end
122
+
123
# Fills in width/height attributes for images that do not declare them.
#
# Dimensions are first looked for at the end of the file name (e.g.
# "banner_100x35.png"). Otherwise the image is downloaded and measured with
# ImageSize. An image that cannot be parsed, resolved or downloaded is skipped
# rather than aborting the whole run.
#
# images     - objects supporting [] and []= for 'src', 'width' and 'height'
#              (the caller passes <img> nodes).
# parent_uri - URI of the page, used to resolve relative image references.
#
# Returns an Array of the images that ended up with both width and height.
# Iteration stops once more than four have been collected.
def update_images_sizes(images, parent_uri)
  sized_images = []
  images.each do |img|
    break if sized_images.size > 4

    src = img['src']
    next unless src
    begin
      img_uri = URI.parse(src)
    rescue URI::InvalidURIError
      next
    end

    # path is nil for opaque URIs such as mailto:, hence the to_s.
    match = /(\d{1,4})x(\d{1,4})\z/i.match(File.basename(img_uri.path.to_s, '.*'))
    if match
      img['width'] = match[1]
      img['height'] = match[2]
    else
      begin
        target = img_uri.relative? ? parent_uri.merge(img_uri) : img_uri
        # Only URIs that open-uri can fetch respond to #open. Going through
        # URI#open instead of Kernel#open keeps a page-controlled src from
        # opening local files or pipes.
        next unless target.respond_to?(:open)
        target.open('rb') do |fh|
          img_size = ImageSize.new(fh.read)
          img['width'] = img_size.get_width.to_s unless img_size.get_width.nil?
          img['height'] = img_size.get_height.to_s unless img_size.get_height.nil?
        end
      rescue OpenURI::HTTPError, URI::Error, SocketError, SystemCallError, Timeout::Error, IOError
        # Best effort: a broken or unreachable image is simply not a candidate.
        next
      end
    end
    sized_images << img if img['width'] && img['height']
  end
  sized_images
end
146
+
147
# Sends a HEAD request for a fixed image on www.biostat.wisc.edu (port 80) and
# prints the Content-Length header of the response.
#
# Returns the header value, as returned by Kernel#p.
def validate_size
  head_response = Net::HTTP.start('www.biostat.wisc.edu', 80) do |connection|
    connection.head('/bcg/categories/languages/ruby/ruby_logo.png')
  end
  p head_response['content-length']
end
154
+
155
+ private
156
+
157
# Stub with no behaviour in this version; always returns nil.
def open_url
  nil
end
160
+
161
# Reduces a host name to its last two labels so hosts of one site compare
# equal, e.g. "static.example.com" and "www.example.com" both give
# "example.com".
#
# domain - host name String, or nil when the URI had no host.
#
# Returns the lower-cased two-label domain. nil and "" are returned
# unchanged, and IPv4 literals are returned whole because their last two
# octets are not a domain.
#
# NOTE(review): multi-part public suffixes are not recognised, so
# "www.example.co.uk" yields "co.uk".
def get_sld(domain)
  return domain if domain.nil? || domain.empty?

  # Host names are case-insensitive; a trailing root dot is not significant.
  host = domain.downcase.chomp('.')
  return host if host =~ /\A\d{1,3}(\.\d{1,3}){3}\z/

  labels = host.split('.')
  labels.size > 2 ? labels.last(2).join('.') : host
end
165
+ end
166
+
167
+ # Monkey Patches for OEMBED && OPEN-URI
168
# Reopens OEmbed::Formatter::JSON and defines the module-level method
# test_value, which returns a small JSON document as a String.
# The bare `private` below only affects instance methods defined after it, so
# it does not change the visibility of `def self.test_value`.
# NOTE(review): presumably this replaces the sample document the oembed gem
# uses to check its JSON backend - confirm against the gem version in use.
+ module OEmbed::Formatter::JSON
169
+ private
170
+ def self.test_value
171
+ <<-JSON
172
+ {"version":1.0, "string":"test", "int":42,"html":"<i>Cool's</i>\\n the \\"word\\"\\u0021"}
173
+ JSON
174
+ end
175
+ end
176
+
177
# Reopens the oEmbed response class to add the extra attributes that
# process_video and process_doc assign on each response.
class OEmbed::Response
  # Parsed request URL of the response.
  attr_accessor :uri
  # Domain derived from the request URL's host.
  attr_accessor :sld
  # Height copied from the embed markup (set for Scribd documents).
  attr_accessor :height
end
180
+
181
+ # Monkey patch OPENURI to follow redirect URLS
182
def OpenURI.redirectable?(uri1, uri2) # :nodoc:
  # Decides whether open-uri may follow a redirect from uri1 to uri2.
  # A redirect within one scheme is always allowed. Otherwise the origin must
  # be http or ftp and the destination http, https or ftp, which rules out
  # e.g. http://... to file:///etc/passwd.
  # This check is ad hoc; it should be extensible/configurable.
  from_scheme = uri1.scheme
  to_scheme = uri2.scheme
  return true if from_scheme.downcase == to_scheme.downcase

  /\A(?:http|ftp)\z/i =~ from_scheme && /\A(?:https?|ftp)\z/i =~ to_scheme
end
189
+
190
+ end
@@ -0,0 +1,3 @@
1
module LinksProcessor
  # Gem version string; frozen so it cannot be modified in place at runtime.
  VERSION = "0.5.1".freeze
end
@@ -0,0 +1,29 @@
1
# -*- encoding: utf-8 -*-
# Put this checkout's lib/ on the load path so the version file below resolves.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "links_processor/version"

# Packaging metadata for the gem; the file lists are read from git at build time.
Gem::Specification.new do |spec|
  spec.name        = "links_processor"
  spec.version     = LinksProcessor::VERSION
  spec.authors     = ["Shadab Ahmed"]
  spec.email       = ["shadab.ansari@gmail.com"]
  spec.homepage    = ""
  spec.summary     = "Processes any link and gets meta info for it"
  spec.description = "Fetches meta and embed information for any link."

  spec.rubyforge_project = "links_processor"

  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared without version constraints.
  %w[nokogiri open-uri oembed image_size].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end

  # specify any further dependencies here; for example:
  # spec.add_development_dependency "rspec"
  # spec.add_runtime_dependency "rest-client"
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: links_processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Shadab Ahmed
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-16 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &15446840 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *15446840
25
+ - !ruby/object:Gem::Dependency
26
+ name: open-uri
27
+ requirement: &15441940 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *15441940
36
+ - !ruby/object:Gem::Dependency
37
+ name: oembed
38
+ requirement: &15441400 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *15441400
47
+ - !ruby/object:Gem::Dependency
48
+ name: image_size
49
+ requirement: &15440980 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *15440980
58
+ description: Fetches meta and embed information for any link.
59
+ email:
60
+ - shadab.ansari@gmail.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - .gitignore
66
+ - Gemfile
67
+ - Rakefile
68
+ - lib/links_processor.rb
69
+ - lib/links_processor/version.rb
70
+ - links_processor.gemspec
71
+ homepage: ''
72
+ licenses: []
73
+ post_install_message:
74
+ rdoc_options: []
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project: links_processor
91
+ rubygems_version: 1.8.10
92
+ signing_key:
93
+ specification_version: 3
94
+ summary: Processes any link and gets meta info for it
95
+ test_files: []