url_scraper 0.0.3

data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in url_scraper.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 Genii Technologies Pvt Ltd. and Akshay Shinde
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,42 @@
+ = Url Scraper
+ A simple plugin that provides a Facebook-style URL scraper. Using url_scraper you can turn any plain textarea into a smart textarea that auto-detects URLs entered by users, fetches the content of the webpage, and displays it on the page. This gem is built on top of the 'opengraph' gem by Michael Bleigh. It uses Open Graph tags, falling back to the meta, title and img tags when Open Graph tags are not present, to fetch information about a page.
+
+
+ == Installation
+
+   gem install url_scraper
+
+ Add this line to your application.js file:
+
+   //= require jquery.scraper
+
+ Add this line to your application.css file:
+
+   *= require scraper
+
+ Add the following to your routes:
+
+   match '/scrape_url' => "url_scraper#scrape", :via => :post
+
+ == Using the plugin in a view
+
+ Create a text area or text field with id="scrape_url" and that's it.
+
+ == Advanced Usage
+
+   require 'url_scraper'
+
+   movie = UrlScraper.fetch('http://www.rottentomatoes.com/m/1217700-kick_ass/')
+
+   movie.title # => 'Kick-Ass'
+   movie.movie? # => true
+   movie.image # => 'http://images.rottentomatoes.com/images/movie/custom/00/1217700.jpg'
+
+ If you try to fetch Open Graph information for a URL that doesn't
+ have any, the <tt>fetch</tt> method will instead return values from the title tag and other meta tags specified on the page.
+
+ The UrlScraper::Object that is returned is just a Hash with accessors
+ built into it, so you can examine what properties you've retrieved like so:
+
+   movie.keys # => ['type','image','title','url']
+
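Editor's note: if you already have the HTML in hand, the library's <tt>UrlScraper.parse</tt> method (see lib/url_scraper.rb below) lets you skip the HTTP request. A minimal sketch; the HTML string here is illustrative:

  require 'url_scraper'

  html = '<html><head><meta property="og:title" content="Example"/></head></html>'
  page = UrlScraper.parse(html)  # returns the same UrlScraper::Object that fetch does
  page.title # => 'Example'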
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/app/assets/javascripts/jquery.scraper.js ADDED
@@ -0,0 +1,107 @@
+ $(function(){
+   $('.hidden').hide();
+
+   // Spacebar after typing: look for a URL in the field and scrape it
+   $('#scrape_url').keyup(function(event){
+     if (event.keyCode == 32){
+       var url = linkify($(this).val());
+       if (url != null && url.length > 0)
+         scrapeUrl(url.toString());
+     }
+   });
+
+   // Pasted text: wait briefly for the field to update, then scrape
+   $('#scrape_url').bind('paste', function () {
+     var url;
+     setTimeout(function () {
+       url = linkify($('#scrape_url').val());
+       if (url != null && url.length > 0)
+         scrapeUrl(url.toString());
+     }, 100);
+   });
+
+   function linkify(inputText) {
+     var replacedText, replacePattern1, replacePattern2, url;
+
+     // URLs starting with http://, https://, or ftp://
+     replacePattern1 = /((https?|ftp):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/gim;
+     replacedText = inputText.replace(replacePattern1, '$1');
+
+     // URLs starting with "www." (without // before it, or it'd re-link the ones done above).
+     replacePattern2 = /(^|[^\/])(www\.[\S]+(\b|$))/gim;
+     replacedText = replacedText.replace(replacePattern2, 'http://$2');
+
+     url = replacedText.match(replacePattern1);
+
+     return url;
+   }
+
+   function scrapeUrl(url){
+     $.ajax({
+       url: "/scrape_url",
+       data: { url: url },
+       type: 'post',
+       success: function(data){
+         var container = document.createElement('div');
+         container.className = "scraped_content";
+         $('#scrape_url').after(container);
+
+         // Check if a video is present
+         if(data.video != undefined)
+           alert("video pending");
+         else if(data.image.length > 0)
+         {
+           var image_slider = document.createElement('div');
+           image_slider.setAttribute("id", "image_slider");
+           var bjqs_ul = document.createElement('ul');
+           bjqs_ul.className = "bjqs";
+           $(image_slider).append(bjqs_ul);
+           $('.scraped_content').append(image_slider);
+
+           // Relative image paths are prefixed with the scraped URL
+           if(data.image instanceof Array) {
+             for(var i in data.image){
+               if(data.image[i].match("//") == undefined)
+                 $(".scraped_content #image_slider ul.bjqs").append("<li><img src='" + url + data.image[i] + "'></li>");
+               else
+                 $(".scraped_content #image_slider ul.bjqs").append("<li><img src='" + data.image[i] + "'></li>");
+             }
+           } else if(data.image != null) {
+             if(data.image.match("//") == undefined)
+               $(".scraped_content #image_slider ul.bjqs").append("<li><img src='" + url + data.image + "'></li>");
+             else
+               $(".scraped_content #image_slider ul.bjqs").append("<li><img src='" + data.image + "'></li>");
+           }
+           init_slider();
+         }
+
+         // Add the title container
+         var title = document.createElement('input');
+         title.setAttribute("type", "text");
+         title.setAttribute("value", data.title);
+         $('.scraped_content').append(title);
+
+         // Add the description container
+         var description = document.createElement('textarea');
+         if(data.description.length > 0)
+           $(description).append(data.description);
+         else
+           description.setAttribute("placeholder", "This webpage doesn't provide any description. Go ahead and write your own.");
+         $('.scraped_content').append(description);
+       }
+     });
+   }
+
+   function init_slider(){
+     $('.scraped_content #image_slider').bjqs({
+       'height' : 100,
+       'width' : 170,
+       'responsive' : true,
+       'showmarkers' : false
+     });
+   }
+
+ });
data/app/assets/stylesheets/scraper.css
File without changes
data/app/controllers/url_scraper_controller.rb ADDED
@@ -0,0 +1,6 @@
+ class UrlScraperController < ApplicationController
+   def scrape
+     object = UrlScraper.fetch(params[:url])
+     render :json => object
+   end
+ end
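Editor's note: the JSON rendered by this action is what the success callback in jquery.scraper.js consumes. A minimal sketch of a possible payload, reusing the illustrative values from the README (actual keys depend on the tags present on the scraped page; note that fetch returns false on network errors, in which case the action renders false):

  # Illustrative payload only; keys vary with the scraped page's tags.
  {
    "type"  => "movie",
    "title" => "Kick-Ass",
    "image" => "http://images.rottentomatoes.com/images/movie/custom/00/1217700.jpg",
    "url"   => "http://www.rottentomatoes.com/m/1217700-kick_ass/"
  }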
data/lib/url_scraper.rb ADDED
@@ -0,0 +1,106 @@
+ require "url_scraper/version"
+ require 'hashie'
+ require 'nokogiri'
+ require 'restclient'
+ require 'logger'
+ require 'thor'
+
+ module UrlScraper
+   # Tell Rails to load all assets
+   class Engine < Rails::Engine
+   end
+
+   class CLI < Thor
+   end
+
+   # Fetch Open Graph data from the specified URI. Makes an
+   # HTTP GET request and returns an UrlScraper::Object if there
+   # is data to be found or <tt>false</tt> if there isn't.
+   #
+   # Pass <tt>false</tt> for the second argument if you want to
+   # see invalid (i.e. missing a required attribute) data.
+   def self.fetch(uri, strict = true)
+     parse(RestClient.get(uri).body, strict)
+   rescue RestClient::Exception, SocketError
+     false
+   end
+
+   def self.parse(html, strict = true)
+     logger = Logger.new(STDOUT)
+     doc = Nokogiri::HTML.parse(html)
+     page = UrlScraper::Object.new
+     # Collect every <meta property="og:*"> tag into the page object
+     doc.css('meta').each do |m|
+       if m.attribute('property') && m.attribute('property').to_s.match(/^og:(.+)$/i)
+         page[$1.gsub('-','_')] = m.attribute('content').to_s
+       end
+     end
+
+     # Fall back to the <title>, meta description and first three <img>
+     # tags when the corresponding Open Graph tags are missing
+     page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
+     if page.description.nil?
+       page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
+     end
+     if page.image.nil?
+       image_array = doc.css("img").take(3).collect{|img| img['src']}
+       page.image = image_array unless image_array.empty?
+     end
+     # return false if page.keys.empty?
+     # return false unless page.valid? if strict
+     page
+   end
+
+   TYPES = {
+     'activity' => %w(activity sport),
+     'business' => %w(bar company cafe hotel restaurant),
+     'group' => %w(cause sports_league sports_team),
+     'organization' => %w(band government non_profit school university),
+     'person' => %w(actor athlete author director musician politician public_figure),
+     'place' => %w(city country landmark state_province),
+     'product' => %w(album book drink food game movie product song tv_show),
+     'website' => %w(blog website)
+   }
+
+   # The UrlScraper::Object is a Hash with method accessors for
+   # all detected Open Graph attributes.
+   class Object < Hashie::Mash
+     MANDATORY_ATTRIBUTES = %w(title type image url)
+
+     # The object type.
+     def type
+       self['type']
+     end
+
+     # The schema under which this particular object lies. May be any of
+     # the keys of the TYPES constant.
+     def schema
+       UrlScraper::TYPES.each_pair do |schema, types|
+         return schema if types.include?(self.type)
+       end
+       nil
+     end
+
+     UrlScraper::TYPES.values.flatten.each do |type|
+       define_method "#{type}?" do
+         self.type == type
+       end
+     end
+
+     UrlScraper::TYPES.keys.each do |scheme|
+       define_method "#{scheme}?" do
+         self.type == scheme || UrlScraper::TYPES[scheme].include?(self.type)
+       end
+     end
+
+     # If the Open Graph information for this object doesn't contain
+     # the mandatory attributes, this will be <tt>false</tt>.
+     def valid?
+       MANDATORY_ATTRIBUTES.each{|a| return false unless self[a]}
+       true
+     end
+   end
+ end
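Editor's note: a quick illustration of the UrlScraper::Object accessors above (a minimal sketch with made-up values):

  page = UrlScraper::Object.new
  page.title = 'Kick-Ass'
  page.type  = 'movie'

  page.movie?  # => true
  page.schema  # => 'product' ('movie' is listed under 'product' in TYPES)
  page.valid?  # => false ('image' and 'url' are still missing)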
data/lib/url_scraper/version.rb ADDED
@@ -0,0 +1,3 @@
+ module UrlScraper
+   VERSION = "0.0.3"
+ end
data/url_scraper.gemspec ADDED
@@ -0,0 +1,32 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'url_scraper/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "url_scraper"
+   spec.version       = UrlScraper::VERSION
+   spec.authors       = ["Super Engineer"]
+   spec.email         = ["akshayshinde7@gmail.com"]
+   spec.description   = %q{A simple plugin for extracting information from URLs entered by users (something
+     like what Facebook does). This gem is built on top of the opengraph gem created by Michael
+     Bleigh.}
+   spec.summary       = %q{A simple plugin for extracting information from URLs entered by users (something
+     like what Facebook does). This gem is built on top of the opengraph gem created by Michael
+     Bleigh.}
+   spec.homepage      = "http://github.com/super-engineer/url_scraper"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+
+   spec.add_dependency 'hashie'
+   spec.add_dependency 'nokogiri', '~> 1.5.9'
+   spec.add_dependency 'rest-client', '~> 1.6.7'
+   spec.add_dependency 'thor'
+ end
metadata ADDED
@@ -0,0 +1,157 @@
+ --- !ruby/object:Gem::Specification
+ name: url_scraper
+ version: !ruby/object:Gem::Version
+   version: 0.0.3
+   prerelease:
+ platform: ruby
+ authors:
+ - Super Engineer
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-05-06 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: hashie
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.9
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.9
+ - !ruby/object:Gem::Dependency
+   name: rest-client
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+ - !ruby/object:Gem::Dependency
+   name: thor
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: ! "A simple plugin for extracting information from URLs entered by users
+   (something\n like what Facebook does). This gem is built on top of the opengraph
+   gem created by Michael\n Bleigh."
+ email:
+ - akshayshinde7@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.rdoc
+ - Rakefile
+ - app/assets/javascripts/jquery.scraper.js
+ - app/assets/stylesheets/scraper.css
+ - app/controllers/url_scraper_controller.rb
+ - lib/url_scraper.rb
+ - lib/url_scraper/version.rb
+ - url_scraper.gemspec
+ homepage: http://github.com/super-engineer/url_scraper
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: A simple plugin for extracting information from URLs entered by users (something
+   like what Facebook does). This gem is built on top of the opengraph gem created
+   by Michael Bleigh.
+ test_files: []