url_scraper 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +42 -0
- data/Rakefile +1 -0
- data/app/assets/javascripts/jquery.scraper.js +107 -0
- data/app/assets/stylesheets/scraper.css +0 -0
- data/app/controllers/url_scraper_controller.rb +6 -0
- data/lib/url_scraper.rb +106 -0
- data/lib/url_scraper/version.rb +3 -0
- data/url_scraper.gemspec +32 -0
- metadata +157 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Genii Technologies Pvt Ltd. and Akshay Shinde
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
= Url Scraper
|
2
|
+
It's a simple plugin that provides a Facebook-style URL scraper. Using url_scraper you can convert any plain textarea into a smart textarea that auto-detects URLs entered by users, fetches the content of the web page, and displays it on the page. This gem is built on top of the 'opengraph' gem by Michael Bleigh. It uses Open Graph tags, and falls back to the meta, title and img tags (when Open Graph tags are not present) to fetch information about a page.
|
3
|
+
|
4
|
+
|
5
|
+
== Installation
|
6
|
+
|
7
|
+
gem install url_scraper
|
8
|
+
|
9
|
+
paste this line in your application.js file
|
10
|
+
|
11
|
+
//= require jquery.scraper
|
12
|
+
|
13
|
+
paste this line in your application.css file
|
14
|
+
|
15
|
+
*= require scraper
|
16
|
+
|
17
|
+
Add following to your routes
|
18
|
+
|
19
|
+
match '/scrape_url' => "url_scraper#scrape", :via => :post
|
20
|
+
|
21
|
+
== Using the plugin in a view
|
22
|
+
|
23
|
+
Create a text area or text field with id="scrape_url" and that's it.
|
24
|
+
|
25
|
+
== Advanced Usage
|
26
|
+
|
27
|
+
require 'url_scraper'
|
28
|
+
|
29
|
+
movie = UrlScraper.fetch('http://www.rottentomatoes.com/m/1217700-kick_ass/')
|
30
|
+
|
31
|
+
movie.title # => 'Kick-Ass'
|
32
|
+
movie.movie? # => true
|
33
|
+
movie.image # => 'http://images.rottentomatoes.com/images/movie/custom/00/1217700.jpg'
|
34
|
+
|
35
|
+
If you try to fetch Open Graph information for a URL that doesn't
|
36
|
+
have any, the <tt>fetch</tt> method will return values for title tag and other meta tags specified on the page.
|
37
|
+
|
38
|
+
The UrlScraper::Object that is returned is just a Hash with accessors
|
39
|
+
built into it, so you can examine what properties you've retrieved like so:
|
40
|
+
|
41
|
+
movie.keys # => ['type','image','title','url']
|
42
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,107 @@
|
|
1
|
+
$(function () {
  $('.hidden').hide();

  // Scrape when the user hits space after typing a URL (keyCode 32 === spacebar).
  $('#scrape_url').keyup(function (event) {
    if (event.keyCode === 32) {
      var url = linkify($(this).val()); // was an implicit global
      if (url != null && url.length > 0) {
        scrapUrl(url.toString());
      }
    }
  });

  // Also scrape on paste; the timeout lets the pasted text land in the field first.
  $('#scrape_url').bind('paste', function () {
    setTimeout(function () {
      var url = linkify($('#scrape_url').val());
      if (url != null && url.length > 0) {
        scrapUrl(url.toString());
      }
    }, 100);
  });

  /**
   * Extract URLs from free text.
   *
   * Bare "www." hosts are first prefixed with "http://", then all
   * http(s)/ftp URLs are collected.
   *
   * @param {string} inputText - raw text from the input field.
   * @returns {Array|null} the matched URLs, or null when none are found.
   */
  function linkify(inputText) {
    // URLs starting with http://, https://, or ftp://
    var urlPattern = /((https?|ftp):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/gim;
    // URLs starting with "www." (without // before it, or it'd re-link the ones done above).
    var wwwPattern = /(^|[^\/])(www\.[\S]+(\b|$))/gim;

    var replacedText = inputText.replace(urlPattern, '$1');
    replacedText = replacedText.replace(wwwPattern, 'http://$2');

    // was assigned to an implicit global `url` before being returned
    return replacedText.match(urlPattern);
  }

  /**
   * POST the URL to the server-side scraper and render the returned
   * title / description / image(s) below the input field.
   *
   * @param {string} url - the absolute URL to scrape.
   */
  function scrapUrl(url) {
    $.ajax({
      url: "/scrape_url",
      data: { url: url }, // trailing comma removed (breaks old IE JSON-ish literals)
      type: 'post',
      success: function (data) {
        var container = document.createElement('div');
        container.className = "scraped_content";
        $('#scrape_url').after(container);

        // Check if a video is present
        if (data.video !== undefined) {
          alert("video pending");
        } else if (data.image && data.image.length > 0) {
          // guard added: data.image may be absent entirely, and the old
          // `data.image.length` dereference threw a TypeError in that case
          var imageSlider = document.createElement('div');
          imageSlider.setAttribute("id", "image_slider");
          var bjqsList = document.createElement('ul');
          bjqsList.className = "bjqs";
          $(imageSlider).append(bjqsList);
          $('.scraped_content').append(imageSlider);

          // Normalise to an array so a single image and a list share one path.
          var images = (data.image instanceof Array) ? data.image : [data.image];
          for (var i = 0; i < images.length; i++) {
            var src = images[i];
            if (src == null) continue;
            // Relative paths (no "//") are resolved against the scraped page's URL.
            if (src.indexOf("//") === -1) {
              src = url + src;
            }
            // was: "<li><img src=" + src + "><img></li>" — unquoted src attribute
            // plus a stray second <img> tag produced malformed markup
            $(".scraped_content #image_slider ul.bjqs").append('<li><img src="' + src + '"></li>');
          }
          init_slider();
        }

        // Add the title container
        var title = document.createElement('input');
        title.setAttribute("type", "text");
        title.setAttribute("value", data.title);
        $('.scraped_content').append(title);

        // Add the description container (placeholder when the page has none;
        // guard added: data.description may be absent entirely)
        var description = document.createElement('textarea');
        if (data.description && data.description.length > 0) {
          $(description).append(data.description);
        } else {
          description.setAttribute("placeholder", "This webpage doesn't provide any description. Go ahead and write your own.");
        }
        $('.scraped_content').append(description);
      }
    });
  }

  // Initialise the bjqs image carousel on the freshly appended slider.
  function init_slider() {
    $('.scraped_content #image_slider').bjqs({
      'height': 100,
      'width': 170,
      'responsive': true,
      'showmarkers': false
    });
  }
});
|
File without changes
|
data/lib/url_scraper.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require "url_scraper/version"
require 'hashie'
require 'nokogiri'
require 'restclient'
require 'logger'
require 'thor'

module UrlScraper
  # Rails engine: exposes the gem's bundled assets
  # (jquery.scraper.js, scraper.css) to the host application.
  class Engine < Rails::Engine
  end

  # Thor-based command-line interface (no commands defined yet).
  class CLI < Thor
  end

  # Fetch Open Graph data from the specified URI. Makes an
  # HTTP GET request and returns an UrlScraper::Object if there
  # is data to be found or <tt>false</tt> if there isn't
  # (or the request/DNS lookup fails).
  #
  # Pass <tt>false</tt> for the second argument if you want to
  # see invalid (i.e. missing a required attribute) data.
  def self.fetch(uri, strict = true)
    parse(RestClient.get(uri).body, strict)
  rescue RestClient::Exception, SocketError
    false
  end

  # Parse an HTML document for <meta property="og:..."> tags, falling
  # back to the <title> tag, the meta description and the first three
  # <img> tags when the page supplies no Open Graph equivalents.
  def self.parse(html, strict = true)
    logger   = Logger.new(STDOUT)
    document = Nokogiri::HTML.parse(html)
    page     = UrlScraper::Object.new

    document.css('meta').each do |tag|
      # nil attributes stringify to "", which simply fails the match
      og_match = tag.attribute('property').to_s.match(/^og:(.+)$/i)
      page[og_match[1].gsub('-', '_')] = tag.attribute('content').to_s if og_match
    end

    # Fallbacks for pages without Open Graph tags.
    page.title = (document.at_css('title').text rescue nil) if page.title.nil?

    if page.description.nil?
      meta_description = document.at_css("meta[name='description']")
      page.description = meta_description['content'] unless meta_description.nil?
    end

    if page.image.nil?
      image_sources = document.css('img').take(3).collect { |img| img['src'] }
      page.image = image_sources unless image_sources.empty?
    end

    page
  end

  TYPES = {
    'activity'     => %w(activity sport),
    'business'     => %w(bar company cafe hotel restaurant),
    'group'        => %w(cause sports_league sports_team),
    'organization' => %w(band government non_profit school university),
    'person'       => %w(actor athlete author director musician politician public_figure),
    'place'        => %w(city country landmark state_province),
    'product'      => %w(album book drink food game movie product song tv_show),
    'website'      => %w(blog website)
  }

  # The UrlScraper::Object is a Hash with method accessors for
  # all detected Open Graph attributes.
  class Object < Hashie::Mash
    MANDATORY_ATTRIBUTES = %w(title type image url)

    # The object type.
    def type
      self['type']
    end

    # The schema under which this particular object lies. May be any of
    # the keys of the TYPES constant, or nil for an unknown type.
    def schema
      UrlScraper::TYPES.each_pair do |schema_name, type_list|
        return schema_name if type_list.include?(self.type)
      end
      nil
    end

    # Predicate per concrete type, e.g. movie? / book? / blog?
    UrlScraper::TYPES.values.flatten.each do |concrete_type|
      define_method "#{concrete_type}?" do
        self.type == concrete_type
      end
    end

    # Predicate per schema, e.g. product? matches 'product' itself or
    # any of its concrete types.
    UrlScraper::TYPES.keys.each do |schema_name|
      define_method "#{schema_name}?" do
        self.type == schema_name || UrlScraper::TYPES[schema_name].include?(self.type)
      end
    end

    # If the Open Graph information for this object doesn't contain
    # the mandatory attributes, this will be <tt>false</tt>.
    def valid?
      MANDATORY_ATTRIBUTES.each { |attribute| return false unless self[attribute] }
      true
    end
  end
end
|
data/url_scraper.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'url_scraper/version'

Gem::Specification.new do |spec|
  spec.name          = "url_scraper"
  spec.version       = UrlScraper::VERSION
  spec.authors       = ["Super Engineer"]
  spec.email         = ["akshayshinde7@gmail.com"]
  spec.description   = %q{A simple plugin for extracting information from url entered by user (Something
like what facebook does). This gem is built on top of opengraph gem created by michael
bleigh.}
  spec.summary       = %q{A simple plugin for extracting information from url entered by user (Something
like what facebook does). This gem is built on top of opengraph gem created by michael
bleigh.}
  spec.homepage      = "http://github.com/super-engineer/url_scraper"
  spec.license       = "MIT"

  # Everything tracked by git ships in the gem.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"

  # Runtime dependencies: Hashie backs the Mash result object, Nokogiri
  # parses the HTML, rest-client performs the GET, Thor backs the CLI stub.
  spec.add_dependency 'hashie'
  spec.add_dependency 'nokogiri', '~> 1.5.9'
  spec.add_dependency 'rest-client', '~> 1.6.7'
  spec.add_dependency 'thor'
end
|
metadata
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Super Engineer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-05-06 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: hashie
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: nokogiri
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.5.9
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.5.9
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rest-client
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 1.6.7
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.6.7
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: thor
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
description: ! "A simple plugin for extracting information from url entered by user
|
111
|
+
(Something\n like what facebook does). This gem is built on top of opengraph gem
|
112
|
+
created by michael\n bleigh."
|
113
|
+
email:
|
114
|
+
- akshayshinde7@gmail.com
|
115
|
+
executables: []
|
116
|
+
extensions: []
|
117
|
+
extra_rdoc_files: []
|
118
|
+
files:
|
119
|
+
- .gitignore
|
120
|
+
- Gemfile
|
121
|
+
- LICENSE.txt
|
122
|
+
- README.rdoc
|
123
|
+
- Rakefile
|
124
|
+
- app/assets/javascripts/jquery.scraper.js
|
125
|
+
- app/assets/stylesheets/scraper.css
|
126
|
+
- app/controllers/url_scraper_controller.rb
|
127
|
+
- lib/url_scraper.rb
|
128
|
+
- lib/url_scraper/version.rb
|
129
|
+
- url_scraper.gemspec
|
130
|
+
homepage: http://github.com/super-engineer/url_scraper
|
131
|
+
licenses:
|
132
|
+
- MIT
|
133
|
+
post_install_message:
|
134
|
+
rdoc_options: []
|
135
|
+
require_paths:
|
136
|
+
- lib
|
137
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ! '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
144
|
+
none: false
|
145
|
+
requirements:
|
146
|
+
- - ! '>='
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: '0'
|
149
|
+
requirements: []
|
150
|
+
rubyforge_project:
|
151
|
+
rubygems_version: 1.8.24
|
152
|
+
signing_key:
|
153
|
+
specification_version: 3
|
154
|
+
summary: A simple plugin for extracting information from url entered by user (Something
|
155
|
+
like what facebook does). This gem is built on top of opengraph gem created by michael
|
156
|
+
bleigh.
|
157
|
+
test_files: []
|