url_scrubber 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +33 -0
- data/.rvmrc +48 -0
- data/Gemfile +4 -0
- data/Guardfile +14 -0
- data/README.md +29 -0
- data/Rakefile +10 -0
- data/lib/url_scrubber/version.rb +3 -0
- data/lib/url_scrubber.rb +172 -0
- data/spec/url_scrubber_spec.rb +345 -0
- data/url_scrubber.gemspec +24 -0
- metadata +114 -0
data/.gitignore
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
**.orig
|
2
|
+
*.gem
|
3
|
+
*.rbc
|
4
|
+
*.sassc
|
5
|
+
.bundle
|
6
|
+
.config
|
7
|
+
.DS_Store
|
8
|
+
.rspec
|
9
|
+
.sass-cache
|
10
|
+
.yardoc
|
11
|
+
/.bundle
|
12
|
+
/coverage/
|
13
|
+
/db/*.sqlite3
|
14
|
+
/log/*
|
15
|
+
/public/system/*
|
16
|
+
/spec/tmp/*
|
17
|
+
/tmp/*
|
18
|
+
/vendor/bundle
|
19
|
+
_yardoc
|
20
|
+
capybara-*.html
|
21
|
+
coverage
|
22
|
+
doc/
|
23
|
+
Gemfile.lock
|
24
|
+
InstalledFiles
|
25
|
+
lib/bundler/man
|
26
|
+
pickle-email-*.html
|
27
|
+
pkg
|
28
|
+
rdoc
|
29
|
+
rerun.txt
|
30
|
+
spec/reports
|
31
|
+
test/tmp
|
32
|
+
test/version_tmp
|
33
|
+
tmp
|
data/.rvmrc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
environment_id="ruby-1.9.3-p194@url_scrubber"
|
10
|
+
|
11
|
+
# Uncomment the following lines if you want to verify rvm version per project
|
12
|
+
# rvmrc_rvm_version="1.16.19 (stable)" # 1.10.1 seems to be a safe starting point
|
13
|
+
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
|
14
|
+
# echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
|
15
|
+
# return 1
|
16
|
+
# }
|
17
|
+
|
18
|
+
# First we attempt to load the desired environment directly from the environment
|
19
|
+
# file. This is very fast and efficient compared to running through the entire
|
20
|
+
# CLI and selector. If you want feedback on which environment was used then
|
21
|
+
# insert the word 'use' after --create as this triggers verbose mode.
|
22
|
+
if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
|
23
|
+
&& -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
|
24
|
+
then
|
25
|
+
\. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
|
26
|
+
[[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
|
27
|
+
\. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
|
28
|
+
else
|
29
|
+
# If the environment file has not yet been created, use the RVM CLI to select.
|
30
|
+
rvm --create "$environment_id" || {
|
31
|
+
echo "Failed to create RVM environment '${environment_id}'."
|
32
|
+
return 1
|
33
|
+
}
|
34
|
+
fi
|
35
|
+
|
36
|
+
# If you use bundler, this might be useful to you:
|
37
|
+
# if [[ -s Gemfile ]] && {
|
38
|
+
# ! builtin command -v bundle >/dev/null ||
|
39
|
+
# builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
|
40
|
+
# }
|
41
|
+
# then
|
42
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
43
|
+
# gem install bundler
|
44
|
+
# fi
|
45
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
46
|
+
# then
|
47
|
+
# bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
|
48
|
+
# fi
|
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
guard 'bundler' do
|
5
|
+
watch('Gemfile')
|
6
|
+
# Uncomment the next line if the Gemfile contains the `gemspec' command
|
7
|
+
# watch(/^.+\.gemspec/)
|
8
|
+
end
|
9
|
+
|
10
|
+
guard 'rspec', :version => 2, :cli => '-c' do
|
11
|
+
watch(%r{^spec/.+_spec\.rb$})
|
12
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
13
|
+
watch('spec/spec_helper.rb') { "spec" }
|
14
|
+
end
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# UrlScrubber
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'url_scrubber'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install url_scrubber
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/url_scrubber.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
require "url_scrubber/version"
|
2
|
+
|
3
|
+
# Normalizes URLs into a canonical form and recognizes URLs that belong to a
# handful of social-media services.
#
# Public entry points:
#   UrlScrubber.scrub(url)       -> canonical URL String, or nil
#   UrlScrubber.service_of(url)  -> Symbol naming the service, or :other
#   UrlScrubber.ideal_form?(url) -> true/false
module UrlScrubber
  # Host name => service symbol. A URL belongs to a service when its host is
  # exactly the listed name or a sub-domain of it.
  SERVICE_HOSTS = {
    'youtube.com'     => :youtube,
    'twitter.com'     => :twitter,
    'facebook.com'    => :facebook,
    'linkedin.com'    => :linkedin,
    'plus.google.com' => :google,
    'slideshare.net'  => :slideshare,
    'flickr.com'      => :flickr,
    'pinterest.com'   => :pinterest,
    'vimeo.com'       => :vimeo,
    'yelp.com'        => :yelp
  }.freeze

  # Service symbol => pattern a scrubbed URL must match to count as that
  # service's "ideal" (profile/page-level) form.
  IDEAL_FORMS = {
    :youtube    => %r{\Ahttp://youtube\.com/[\w-]+\z},
    :twitter    => %r{\Ahttp://twitter\.com/\w+\z},
    # The "?" of the query string must be escaped; unescaped it made the
    # preceding "p" optional instead of matching a literal question mark.
    :facebook   => %r{\Ahttp://facebook\.com/(?:profile\.php\?id=\d+|[\w.-]+)\z},
    :linkedin   => %r{\Ahttp://linkedin\.com/(?:company/[\w-]+|profile/view\?id=\d+)\z},
    :google     => %r{\Ahttp://plus\.google\.com/(?:\+[\w-]+|\d+)\z},
    :slideshare => %r{\Ahttp://slideshare\.net/[\w-]+\z},
    :flickr     => %r{\Ahttp://flickr\.com/[\w-]+\z},
    :pinterest  => %r{\Ahttp://pinterest\.com/[\w-]+\z},
    :yelp       => %r{\Ahttp://yelp\.com/[\w-]+\z},
    # A purely numeric path is a video, not a user page.
    :vimeo      => %r{\Ahttp://vimeo\.com/(?!\d+\z)[\w-]+\z}
  }.freeze

  # Extracts the first http(s) URL found in +url+ and normalizes it: the
  # scheme becomes "http", trailing slashes, "#!/", a leading "www." and any
  # "#anchor" are removed, the host is lower-cased, and service-specific
  # rewrites are applied.
  #
  # The argument itself is never modified.
  #
  # @param url [String, nil] text containing a URL
  # @return [String, nil] the scrubbed URL, or nil when no URL is present
  def self.scrub(url)
    return nil if url.nil?

    m = url.match(%r{(https?://\S*)}i)
    return nil unless m

    url = m[1] # a fresh String, so the in-place edits below are safe
    # Case-insensitive on purpose: the match above accepts "HTTPS://", and a
    # case-sensitive substitution here left such URLs un-normalized and made
    # the following steps fail.
    url.sub!(/\Ahttps?/i, 'http')
    url.sub!(/\/+\z/, '')
    url.sub!('#!/', '')
    url = downcase_domain(url)
    remove_www!(url)
    drop_anchor!(special_cases(url))
  end

  # Identifies which known service a URL points at.
  #
  # @param url [String] a URL (scrubbed or not)
  # @return [Symbol] one of the values of SERVICE_HOSTS, or :other
  def self.service_of(url)
    # Stop at "?" and "#" as well as "/", so that a host name appearing in a
    # query string or fragment is not mistaken for the URL's own host.
    domain_match = url.match(%r{https?://([^/?#]+)}i)
    return :other unless domain_match

    # Host names are case-insensitive; drop any "user:password@" prefix.
    domain = domain_match[1].downcase.sub(/\A[^@]*@/, '')

    SERVICE_HOSTS.each do |host, service|
      # Exact host or a real sub-domain only; "not-youtube.com" must not
      # count as YouTube.
      return service if domain == host || domain.end_with?(".#{host}")
    end

    :other
  end

  # Tells whether +url+, once scrubbed, is in the canonical profile/page form
  # of its service. URLs of unknown services are always considered ideal.
  #
  # @param url [String, nil]
  # @return [Boolean] false when +url+ contains no URL at all
  def self.ideal_form?(url)
    url = scrub(url)
    return false unless url

    pattern = IDEAL_FORMS[service_of(url)]
    pattern ? !!(url =~ pattern) : true
  end

  # NOTE: the helpers below are internal. The original file marked them with a
  # bare `private`, which has no effect on `def self.` methods, so they have
  # always been publicly callable; they are left public for compatibility.

  # Returns a copy of +url+ with the "http://host" part lower-cased. The path,
  # query and fragment keep their case. Returns an unchanged copy when +url+
  # has no "http://" authority.
  def self.downcase_domain(url)
    url.sub(%r{http://[^/?#]+}i) { |authority| authority.downcase }
  end

  # Applies the rewrite rules of the service +url+ belongs to, if any.
  # May modify +url+ in place and/or return a new String.
  def self.special_cases(url)
    case service_of(url)
    when :youtube   then youtube(url)
    when :twitter   then twitter(url)
    when :facebook  then facebook(url)
    when :linkedin  then linkedin(url)
    when :google    then google_plus(url)
    when :flickr    then flickr(url)
    when :pinterest then pinterest(url)
    when :yelp      then yelp(url)
    else url
    end
  end

  # Removes a leading "www." / "www2." etc. from the host, in place.
  def self.remove_www!(url)
    url.sub!(%r{://www\d*\.}, '://')
    url
  end

  # Removes the query string (everything from the first "?"), in place.
  def self.drop_url_query!(url)
    url.sub!(/\?.*$/, '')
    url
  end

  # Removes the fragment (everything from the first "#"), in place.
  def self.drop_anchor!(url)
    url.sub!(/#.*$/, '')
    url
  end

  # youtube.com/user/NAME and youtube.com/profile?user=NAME => youtube.com/NAME
  def self.youtube(url)
    url.sub!('youtube.com/user/', 'youtube.com/')
    url.sub!('youtube.com/profile?user=', 'youtube.com/')
    url
  end

  # Reduces "@name", status and "@name" search URLs to twitter.com/NAME.
  def self.twitter(url)
    url.sub!('twitter.com/@', 'twitter.com/')

    status_match = url.match(%r{(twitter\.com/[^/]+)/statuses/\d+})
    url = "http://#{status_match[1]}" if status_match

    search_match = url.match(%r{twitter\.com/search(?:/realtime)?(?:/|\?q=)(?:@|%40)(\S*)$})
    url = "http://twitter.com/#{search_match[1]}" if search_match

    url
  end

  # Album/set URLs keep their first query parameter; "home.php?" prefixes are
  # removed; numeric profiles keep only their "id" parameter; every other
  # query string is dropped.
  def self.facebook(url)
    if url.include?('/media/albums') || url.include?('/media/set')
      return url.split('&', 2).first
    end

    url.sub!(%r{facebook\.com/home\.php[?#!/]+}, 'facebook.com/')

    # profile.php identifies the person solely through ?id=N, so dropping the
    # whole query would collapse every such profile into the same URL.
    profile = url.match(%r{\A([^?#]*facebook\.com/profile\.php)\?(?:[^#]*&)?id=(\d+)})
    return "#{profile[1]}?id=#{profile[2]}" if profile

    drop_url_query!(url)
  end

  # /companies/ => /company/; company URLs lose their query string, while
  # profile URLs keep theirs.
  def self.linkedin(url)
    url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
    drop_url_query!(url) if url.match(%r{com/company/})
    url
  end

  # Drops the "u/0/", "b/" prefixes and everything after the first path
  # component.
  def self.google_plus(url)
    url.sub!('com/u/0/b/', 'com/')
    url.sub!('com/u/0/', 'com/')
    url.sub!('com/b/', 'com/')

    path_match = url.match(%r{^http://plus\.google\.com/([^/]+)})
    return url unless path_match

    "http://plus.google.com/#{path_match[1]}"
  end

  # Reduces photos/NAME, people/NAME and NAME/anything to flickr.com/NAME.
  def self.flickr(url)
    user_match = url.match(%r{flickr\.com/(photos/|people/)?([^/]+)})
    return url unless user_match

    "http://flickr.com/#{user_match[2]}"
  end

  # No Pinterest-specific rewriting.
  def self.pinterest(url)
    url
  end

  # No Yelp-specific rewriting.
  def self.yelp(url)
    url
  end
end
|
@@ -0,0 +1,345 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'url_scrubber'
|
3
|
+
|
4
|
+
describe UrlScrubber do
|
5
|
+
describe '.scrub' do
|
6
|
+
it 'should downcase all domain stuff' do
|
7
|
+
UrlScrubber.scrub("http://ExAmple.COM").should eq('http://example.com')
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should remove whitespace and stray characters around an url' do
|
11
|
+
UrlScrubber.scrub("d http://example.com\t").should eq("http://example.com")
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should turn an https url into http' do
|
15
|
+
UrlScrubber.scrub('https://example.com').should eq('http://example.com')
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should remove trailing slashes' do
|
19
|
+
UrlScrubber.scrub('http://example.com/').should eq('http://example.com')
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should remove #!/' do
|
23
|
+
UrlScrubber.scrub('http://example.com/#!/page').should eq('http://example.com/page')
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should drop #anchors' do
|
27
|
+
UrlScrubber.scrub('http://example.com/page#query:example+pages').should eq('http://example.com/page')
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should drop www' do
|
31
|
+
UrlScrubber.scrub('http://www.example.com/page').should eq('http://example.com/page')
|
32
|
+
end
|
33
|
+
|
34
|
+
describe 'with youtube urls' do
|
35
|
+
it 'should drop /user/ if it exists' do
|
36
|
+
UrlScrubber.scrub('http://youtube.com/user/absolutely').should eq('http://youtube.com/absolutely')
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should handle /profile?user= urls' do
|
40
|
+
UrlScrubber.scrub('http://youtube.com/profile?user=absolutely').should eq('http://youtube.com/absolutely')
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should not kick in for a non-youtube url' do
|
44
|
+
UrlScrubber.scrub('http://example.com/user/absolutely').should_not eq('http://example.com/absolutely')
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe 'with twitter urls' do
|
49
|
+
it 'should drop @ from in front of username' do
|
50
|
+
UrlScrubber.scrub('http://twitter.com/@absolutely').should eq('http://twitter.com/absolutely')
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'should not kick in for a non-twitter url' do
|
54
|
+
UrlScrubber.scrub('http://example.com/@absolutely').should_not eq('http://example.com/absolutely')
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should transform /search/realtime/%40username into that username' do
|
58
|
+
UrlScrubber.scrub('http://twitter.com/search/realtime/%40absolutely').should eq('http://twitter.com/absolutely')
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'should handle similar search urls' do
|
62
|
+
UrlScrubber.scrub('http://twitter.com/search/realtime/@absolutely').should eq('http://twitter.com/absolutely')
|
63
|
+
UrlScrubber.scrub('http://twitter.com/search/%40absolutely').should eq('http://twitter.com/absolutely')
|
64
|
+
UrlScrubber.scrub('http://twitter.com/search/@absolutely').should eq('http://twitter.com/absolutely')
|
65
|
+
UrlScrubber.scrub('http://twitter.com/search/realtime?q=%40absolutely').should eq('http://twitter.com/absolutely')
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should transform user statuses into that user's profile" do
|
69
|
+
UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
describe 'with facebook urls' do
|
74
|
+
it 'should drop /home.php?#!/ from the beginning of the path' do
|
75
|
+
UrlScrubber.scrub('http://facebook.com/home.php?#!/person.name').should eq('http://facebook.com/person.name')
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'should drop www. from the beginning' do
|
79
|
+
UrlScrubber.scrub('http://www.facebook.com/person.name').should eq('http://facebook.com/person.name')
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'should drop an url query' do
|
83
|
+
UrlScrubber.scrub('http://facebook.com/person.name?ref=pb').should eq('http://facebook.com/person.name')
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'should not kick in for a non-facebook url' do
|
87
|
+
UrlScrubber.scrub('http://example.com/home.php?#!/person.name').should_not eq('http://example.com/person.name')
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
describe 'with linkedin urls' do
|
92
|
+
it 'should change /companies/ to /company/' do
|
93
|
+
UrlScrubber.scrub('http://linkedin.com/companies/1337').should eq('http://linkedin.com/company/1337')
|
94
|
+
end
|
95
|
+
|
96
|
+
it 'should drop www. from the beginning' do
|
97
|
+
UrlScrubber.scrub('http://www.linkedin.com/company/1337').should eq('http://linkedin.com/company/1337')
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'should drop query parameters from the end of a company url' do
|
101
|
+
UrlScrubber.scrub('http://linkedin.com/company/1337?trk=tyah').should eq('http://linkedin.com/company/1337')
|
102
|
+
end
|
103
|
+
|
104
|
+
it 'should not drop query parameters a profile view url' do
|
105
|
+
UrlScrubber.scrub('http://www.linkedin.com/profile/view?id=12341324').should eq('http://linkedin.com/profile/view?id=12341324')
|
106
|
+
end
|
107
|
+
|
108
|
+
it 'should not kick in for a non-linkedin url' do
|
109
|
+
UrlScrubber.scrub('http://example.com/companies/1337').should_not eq('http://example.com/company/1337')
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
describe 'with google plus urls' do
|
114
|
+
it 'should drop anything after the first significant path component' do
|
115
|
+
UrlScrubber.scrub('http://plus.google.com/+SomeName/posts').should eq('http://plus.google.com/+SomeName')
|
116
|
+
end
|
117
|
+
|
118
|
+
it 'should drop u/0/b/ from the beginning of the path' do
|
119
|
+
UrlScrubber.scrub(
|
120
|
+
'http://plus.google.com/u/0/b/111111111111111111111'
|
121
|
+
).should eq(
|
122
|
+
'http://plus.google.com/111111111111111111111'
|
123
|
+
)
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'should drop u/0/ from the beginning of the path' do
|
127
|
+
UrlScrubber.scrub(
|
128
|
+
'https://plus.google.com/u/0/5432123454135/posts'
|
129
|
+
).should eq(
|
130
|
+
'http://plus.google.com/5432123454135'
|
131
|
+
)
|
132
|
+
end
|
133
|
+
|
134
|
+
it 'should not kick in for a non-google-plus url' do
|
135
|
+
UrlScrubber.scrub('http://example.com/+SomeName/posts').should_not eq('http://example.com/+SomeName')
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
describe 'with slideshare urls' do
|
140
|
+
pending
|
141
|
+
end
|
142
|
+
|
143
|
+
describe 'with flickr urls' do
|
144
|
+
it 'should drop /photos/ from profile names' do
|
145
|
+
UrlScrubber.scrub('http://flickr.com/photos/username/').should eq('http://flickr.com/username')
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'should drop /people/ from profile names' do
|
149
|
+
UrlScrubber.scrub('http://flickr.com/people/username/').should eq('http://flickr.com/username')
|
150
|
+
end
|
151
|
+
|
152
|
+
it 'should drop www. from the beginning' do
|
153
|
+
UrlScrubber.scrub('http://www.flickr.com/username').should eq('http://flickr.com/username')
|
154
|
+
end
|
155
|
+
|
156
|
+
it 'should drop path components after the username' do
|
157
|
+
UrlScrubber.scrub('http://flickr.com/username/favorites').should eq('http://flickr.com/username')
|
158
|
+
end
|
159
|
+
|
160
|
+
it 'should not kick in for a non-flickr url' do
|
161
|
+
UrlScrubber.scrub('http://example.com/photos/username/').should_not eq('http://example.com/username')
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
describe 'with pinterest urls' do
|
166
|
+
it 'should drop www. from the beginning' do
|
167
|
+
UrlScrubber.scrub('http://www.pinterest.com/username').should eq('http://pinterest.com/username')
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
171
|
+
describe 'with yelp urls' do
|
172
|
+
it 'should drop www. from the beginning' do
|
173
|
+
UrlScrubber.scrub('http://www.yelp.com/biz/very-important-business').should eq('http://yelp.com/biz/very-important-business')
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
describe 'with vimeo urls' do
|
178
|
+
pending
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
describe '.service_of' do
|
183
|
+
it 'returns :youtube for YouTube urls' do
|
184
|
+
UrlScrubber.service_of('http://youtube.com/absolutely').should eq :youtube
|
185
|
+
end
|
186
|
+
|
187
|
+
it 'returns :twitter for Twitter urls' do
|
188
|
+
UrlScrubber.service_of('http://twitter.com/absolutely').should eq :twitter
|
189
|
+
end
|
190
|
+
|
191
|
+
it 'returns :facebook for Facebook urls' do
|
192
|
+
UrlScrubber.service_of('http://facebook.com/person.name').should eq :facebook
|
193
|
+
end
|
194
|
+
|
195
|
+
it 'returns :linkedin for LinkedIn urls' do
|
196
|
+
UrlScrubber.service_of('http://linkedin.com/company/1337').should eq :linkedin
|
197
|
+
end
|
198
|
+
|
199
|
+
it 'returns :google for Google+ urls' do
|
200
|
+
UrlScrubber.service_of('http://plus.google.com/+SomeName').should eq :google
|
201
|
+
end
|
202
|
+
|
203
|
+
it 'returns :slideshare for SlideShare urls' do
|
204
|
+
UrlScrubber.service_of('http://slideshare.net/absolutely').should eq :slideshare
|
205
|
+
end
|
206
|
+
|
207
|
+
it 'returns :flickr for Flickr urls' do
|
208
|
+
UrlScrubber.service_of('http://flickr.com/username').should eq :flickr
|
209
|
+
end
|
210
|
+
|
211
|
+
it 'returns :pinterest for Pinterest urls' do
|
212
|
+
UrlScrubber.service_of('http://pinterest.com/username').should eq :pinterest
|
213
|
+
end
|
214
|
+
|
215
|
+
it 'returns :yelp for Yelp urls' do
|
216
|
+
UrlScrubber.service_of('http://yelp.com/very-important-business').should eq :yelp
|
217
|
+
end
|
218
|
+
|
219
|
+
it 'returns :vimeo for Vimeo urls' do
|
220
|
+
UrlScrubber.service_of('http://vimeo.com/absolutely').should eq :vimeo
|
221
|
+
end
|
222
|
+
|
223
|
+
it 'returns :other for other urls' do
|
224
|
+
UrlScrubber.service_of('http://example.com/page').should eq :other
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
describe '.ideal_form?' do
|
229
|
+
it 'scrubs the url beforehand' do
|
230
|
+
UrlScrubber.should_receive(:scrub).with('http://example.com/some-page/').and_return('http://example.com/some-page')
|
231
|
+
UrlScrubber.ideal_form?('http://example.com/some-page/')
|
232
|
+
end
|
233
|
+
|
234
|
+
describe 'for youtube' do
|
235
|
+
it 'returns true for apparent channel urls' do
|
236
|
+
UrlScrubber.ideal_form?('http://youtube.com/absolutely').should be_true
|
237
|
+
end
|
238
|
+
|
239
|
+
it 'returns false for videos' do
|
240
|
+
UrlScrubber.ideal_form?('http://youtube.com/watch?v=vRGMAW1wzQ8').should be_false
|
241
|
+
end
|
242
|
+
end
|
243
|
+
|
244
|
+
describe 'for twitter' do
|
245
|
+
it 'returns true for apparent user urls' do
|
246
|
+
UrlScrubber.ideal_form?('http://twitter.com/absolutely').should be_true
|
247
|
+
end
|
248
|
+
|
249
|
+
it 'returns false for other pages' do
|
250
|
+
UrlScrubber.ideal_form?('http://twitter.com/').should be_false
|
251
|
+
end
|
252
|
+
end
|
253
|
+
|
254
|
+
describe 'for facebook' do
|
255
|
+
it 'returns true for apparent user urls' do
|
256
|
+
UrlScrubber.ideal_form?('http://facebook.com/person.name').should be_true
|
257
|
+
UrlScrubber.ideal_form?('http://facebook.com/profile.php?id=543123521').should be_true
|
258
|
+
end
|
259
|
+
|
260
|
+
it 'returns false for other urls' do
|
261
|
+
UrlScrubber.ideal_form?('http://facebook.com/').should be_false
|
262
|
+
end
|
263
|
+
end
|
264
|
+
|
265
|
+
describe 'for linkedin' do
|
266
|
+
it 'returns true for apparent company urls' do
|
267
|
+
UrlScrubber.ideal_form?('http://linkedin.com/company/1337').should be_true
|
268
|
+
UrlScrubber.ideal_form?('http://www.linkedin.com/profile/view?id=12341324').should be_true
|
269
|
+
UrlScrubber.ideal_form?('http://linkedin.com/company/brand-name').should be_true
|
270
|
+
end
|
271
|
+
|
272
|
+
it 'returns false for other urls' do
|
273
|
+
UrlScrubber.ideal_form?('http://linkedin.com/jobs/c-Cisco').should be_false
|
274
|
+
end
|
275
|
+
end
|
276
|
+
|
277
|
+
describe 'for google plus' do
|
278
|
+
it 'returns true for apparent person urls' do
|
279
|
+
UrlScrubber.ideal_form?('http://plus.google.com/+SomeName').should be_true
|
280
|
+
UrlScrubber.ideal_form?('http://plus.google.com/u/0/b/111111111111111111111').should be_true
|
281
|
+
UrlScrubber.ideal_form?('https://plus.google.com/u/0/5432123454135/posts').should be_true
|
282
|
+
end
|
283
|
+
|
284
|
+
it 'returns false for other urls' do
|
285
|
+
UrlScrubber.ideal_form?('http://plus.google.com/').should be_false
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
describe 'for slideshare' do
|
290
|
+
it 'returns true for apparent user urls' do
|
291
|
+
UrlScrubber.ideal_form?('http://slideshare.net/absolutely').should be_true
|
292
|
+
end
|
293
|
+
|
294
|
+
it 'returns false for other urls' do
|
295
|
+
UrlScrubber.ideal_form?('http://slideshare.net/absolutely/how-to-create-great-slides-for-presentations').should be_false
|
296
|
+
end
|
297
|
+
end
|
298
|
+
|
299
|
+
describe 'for flickr' do
|
300
|
+
it 'returns true for apparent user urls' do
|
301
|
+
UrlScrubber.ideal_form?('http://flickr.com/username').should be_true
|
302
|
+
end
|
303
|
+
|
304
|
+
it 'returns false for other urls' do
|
305
|
+
UrlScrubber.ideal_form?('http://flickr.com/').should be_false
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
describe 'for pinterest' do
|
310
|
+
it 'returns true for apparent user urls' do
|
311
|
+
UrlScrubber.ideal_form?('http://pinterest.com/username').should be_true
|
312
|
+
end
|
313
|
+
|
314
|
+
it 'returns false for other urls' do
|
315
|
+
UrlScrubber.ideal_form?('http://pinterest.com/pin/532412513451524').should be_false
|
316
|
+
end
|
317
|
+
end
|
318
|
+
|
319
|
+
describe 'for yelp' do
|
320
|
+
it 'returns true for apparent business urls' do
|
321
|
+
UrlScrubber.ideal_form?('http://yelp.com/very-important-business').should be_true
|
322
|
+
end
|
323
|
+
|
324
|
+
it 'returns false for other urls' do
|
325
|
+
UrlScrubber.ideal_form?('http://yelp.com/user_details?userid=Aheunaobuh-huoanuhbAU').should be_false
|
326
|
+
end
|
327
|
+
end
|
328
|
+
|
329
|
+
describe 'for vimeo' do
|
330
|
+
it 'returns true for apparent user urls' do
|
331
|
+
UrlScrubber.ideal_form?('http://vimeo.com/absolutely').should be_true
|
332
|
+
end
|
333
|
+
|
334
|
+
it 'returns false for video urls' do
|
335
|
+
UrlScrubber.ideal_form?('http://vimeo.com/45453874578').should be_false
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
describe 'for other sites' do
|
340
|
+
it 'returns true for any other site, really' do
|
341
|
+
UrlScrubber.ideal_form?('http://example.com/absolutely/anything').should be_true
|
342
|
+
end
|
343
|
+
end
|
344
|
+
end
|
345
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/url_scrubber/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem packaging metadata for url_scrubber.
Gem::Specification.new do |gem|
  gem.authors       = ["Colin Langton", "Christopher Maujean"]
  gem.email         = ["colin@hoteldelta.net", "cmaujean@brandle.net"]
  gem.description   = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
  gem.summary       = %q{Clean up URLs.}
  gem.homepage      = "http://brandle.net"

  # `git ls-files` prints one path per line, so split on the input record
  # separator ($/, "\n" by default). The previous `$\` is the *output* record
  # separator, which is nil by default and made split break on any whitespace,
  # cutting paths that contain spaces into pieces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "url_scrubber"
  gem.require_paths = ["lib"]
  gem.version       = UrlScrubber::VERSION

  # testing
  gem.add_development_dependency 'rspec', '~> 2.11.0'
  gem.add_development_dependency 'guard-bundler', "~> 0.1.3"
  gem.add_development_dependency 'guard-rspec', "~> 0.4.3"
  gem.add_development_dependency 'terminal-notifier-guard'
  gem.add_development_dependency 'rb-fsevent', '~> 0.9.1'
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_scrubber
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Colin Langton
|
9
|
+
- Christopher Maujean
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2013-06-06 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: &2152156960 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.11.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *2152156960
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: guard-bundler
|
28
|
+
requirement: &2152155880 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.3
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2152155880
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: guard-rspec
|
39
|
+
requirement: &2152154720 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 0.4.3
|
45
|
+
type: :development
|
46
|
+
prerelease: false
|
47
|
+
version_requirements: *2152154720
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: terminal-notifier-guard
|
50
|
+
requirement: &2152153980 !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
type: :development
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: *2152153980
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: rb-fsevent
|
61
|
+
requirement: &2152153300 !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ~>
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 0.9.1
|
67
|
+
type: :development
|
68
|
+
prerelease: false
|
69
|
+
version_requirements: *2152153300
|
70
|
+
description: Remove extraneous bits from URLs, follow redirects, identify social media
|
71
|
+
urls, etc.
|
72
|
+
email:
|
73
|
+
- colin@hoteldelta.net
|
74
|
+
- cmaujean@brandle.net
|
75
|
+
executables: []
|
76
|
+
extensions: []
|
77
|
+
extra_rdoc_files: []
|
78
|
+
files:
|
79
|
+
- .gitignore
|
80
|
+
- .rvmrc
|
81
|
+
- Gemfile
|
82
|
+
- Guardfile
|
83
|
+
- README.md
|
84
|
+
- Rakefile
|
85
|
+
- lib/url_scrubber.rb
|
86
|
+
- lib/url_scrubber/version.rb
|
87
|
+
- spec/url_scrubber_spec.rb
|
88
|
+
- url_scrubber.gemspec
|
89
|
+
homepage: http://brandle.net
|
90
|
+
licenses: []
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
none: false
|
97
|
+
requirements:
|
98
|
+
- - ! '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ! '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 1.8.16
|
110
|
+
signing_key:
|
111
|
+
specification_version: 3
|
112
|
+
summary: Clean up URLs.
|
113
|
+
test_files:
|
114
|
+
- spec/url_scrubber_spec.rb
|