url_scrubber 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +33 -0
- data/.rvmrc +48 -0
- data/Gemfile +4 -0
- data/Guardfile +14 -0
- data/README.md +29 -0
- data/Rakefile +10 -0
- data/lib/url_scrubber/version.rb +3 -0
- data/lib/url_scrubber.rb +172 -0
- data/spec/url_scrubber_spec.rb +345 -0
- data/url_scrubber.gemspec +24 -0
- metadata +114 -0
data/.gitignore
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
**.orig
|
2
|
+
*.gem
|
3
|
+
*.rbc
|
4
|
+
*.sassc
|
5
|
+
.bundle
|
6
|
+
.config
|
7
|
+
.DS_Store
|
8
|
+
.rspec
|
9
|
+
.sass-cache
|
10
|
+
.yardoc
|
11
|
+
/.bundle
|
12
|
+
/coverage/
|
13
|
+
/db/*.sqlite3
|
14
|
+
/log/*
|
15
|
+
/public/system/*
|
16
|
+
/spec/tmp/*
|
17
|
+
/tmp/*
|
18
|
+
/vendor/bundle
|
19
|
+
_yardoc
|
20
|
+
capybara-*.html
|
21
|
+
coverage
|
22
|
+
doc/
|
23
|
+
Gemfile.lock
|
24
|
+
InstalledFiles
|
25
|
+
lib/bundler/man
|
26
|
+
pickle-email-*.html
|
27
|
+
pkg
|
28
|
+
rdoc
|
29
|
+
rerun.txt
|
30
|
+
spec/reports
|
31
|
+
test/tmp
|
32
|
+
test/version_tmp
|
33
|
+
tmp
|
data/.rvmrc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
# Fully qualified ruby (plus optional @gemset) that this project is run under.
environment_id="ruby-1.9.3-p194@url_scrubber"

# Uncomment the following lines if you want to verify rvm version per project
# rvmrc_rvm_version="1.16.19 (stable)" # 1.10.1 seems as a safe start
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
#   return 1
# }

# Fast path: when an environment file for $environment_id already exists and
# is non-empty, source it directly instead of going through the RVM CLI and
# selector. If you want feedback on which environment was used then insert
# the word 'use' after --create as this triggers verbose mode.
if [[ -d "${rvm_path:-$HOME/.rvm}/environments" ]] &&
   [[ -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
then
  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
  # The after_use hook is optional, and a failing hook must not abort loading.
  if [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]]
  then
    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
  fi
else
  # If the environment file has not yet been created, use the RVM CLI to select.
  if ! rvm --create "$environment_id"
  then
    echo "Failed to create RVM environment '${environment_id}'."
    return 1
  fi
fi
|
35
|
+
|
36
|
+
# If you use bundler, this might be useful to you:
|
37
|
+
# if [[ -s Gemfile ]] && {
|
38
|
+
# ! builtin command -v bundle >/dev/null ||
|
39
|
+
# builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
|
40
|
+
# }
|
41
|
+
# then
|
42
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
43
|
+
# gem install bundler
|
44
|
+
# fi
|
45
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
46
|
+
# then
|
47
|
+
# bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
|
48
|
+
# fi
|
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
# Bundler guard: triggered by changes to the Gemfile.
guard('bundler') do
  watch('Gemfile')
  # Uncomment next line if Gemfile contain `gemspec' command
  # watch(/^.+\.gemspec/)
end

# RSpec guard: `-c` is passed through to the rspec command line.
guard('rspec', :version => 2, :cli => '-c') do
  # A changed spec file is reported as-is.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed file under lib/ maps to the spec file with the same relative name.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/#{match[1]}_spec.rb"
  end

  # A changed spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# UrlScrubber
|
2
|
+
|
3
|
+
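UrlScrubber cleans up URLs: it normalizes them (`http` scheme, lower-case host without `www.`, no trailing slash or `#anchor`) and recognizes profile URLs of social media services such as Twitter, Facebook, LinkedIn, Google+ and YouTube.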
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'url_scrubber'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install url_scrubber
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
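Call `UrlScrubber.scrub(url)` to normalize a URL; it returns `nil` when the text contains no http(s) URL. For example, `UrlScrubber.scrub('https://www.twitter.com/@absolutely/')` returns `'http://twitter.com/absolutely'`. Use `UrlScrubber.service_of(url)` to identify the service a URL belongs to (e.g. `:twitter`, or `:other` for unrecognized sites) and `UrlScrubber.ideal_form?(url)` to check whether a URL looks like a canonical profile URL for its service.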
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/url_scrubber.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
require "url_scrubber/version"
|
2
|
+
|
3
|
+
# Normalises URLs and recognises profile URLs of a handful of social media
# services.
#
# All methods are module-level (`UrlScrubber.scrub`, ...). Scrubbed URLs always
# use the `http` scheme, a lower-case host without a leading `www.`, no
# trailing slash and no `#fragment`.
module UrlScrubber
  # Domains of the recognised services, paired with the symbol that
  # {service_of} reports for them. A host belongs to a service when it equals
  # the domain or is a subdomain of it.
  SERVICE_DOMAINS = [
    ['youtube.com',     :youtube],
    ['twitter.com',     :twitter],
    ['facebook.com',    :facebook],
    ['linkedin.com',    :linkedin],
    ['plus.google.com', :google],
    ['slideshare.net',  :slideshare],
    ['flickr.com',      :flickr],
    ['pinterest.com',   :pinterest],
    ['vimeo.com',       :vimeo],
    ['yelp.com',        :yelp]
  ].freeze

  # For each service, the shape a scrubbed URL must have to count as a
  # canonical profile/page URL. Services missing from this table (i.e. :other)
  # are always considered ideal.
  IDEAL_FORMS = {
    :youtube    => %r{\Ahttp://youtube\.com/[\w-]+\z},
    :twitter    => %r{\Ahttp://twitter\.com/\w+\z},
    # The "?" of the query string is a literal here; unescaped it would make
    # the preceding "p" optional instead.
    :facebook   => %r{\Ahttp://facebook\.com/(?:profile\.php\?id=\d+|[\w.-]+)\z},
    :linkedin   => %r{\Ahttp://linkedin\.com/(?:company/[\w-]+|profile/view\?id=\d+)\z},
    :google     => %r{\Ahttp://plus\.google\.com/(?:\+[\w-]+|\d+)\z},
    :slideshare => %r{\Ahttp://slideshare\.net/[\w-]+\z},
    :flickr     => %r{\Ahttp://flickr\.com/[\w-]+\z},
    :pinterest  => %r{\Ahttp://pinterest\.com/[\w-]+\z},
    :yelp       => %r{\Ahttp://yelp\.com/[\w-]+\z},
    # A purely numeric path is a video, not a user page.
    :vimeo      => %r{\Ahttp://vimeo\.com/(?!\d+\z)[\w-]+\z}
  }.freeze

  # Extracts the first http(s) URL found in +url+ and normalises it.
  #
  # The argument is never modified. The scheme is matched case-insensitively
  # and always rewritten to lower-case `http`, so `HTTPS://Example.com` is
  # handled like `https://example.com`.
  #
  # @param url [String, nil] text containing a URL, possibly surrounded by
  #   whitespace or stray characters
  # @return [String, nil] the scrubbed URL, or nil when +url+ is nil or holds
  #   no http(s) URL with a host
  def self.scrub(url)
    return nil if url.nil?

    # Require at least one host character so a bare "http://" is rejected
    # instead of being mangled further down.
    m = url.match(%r{https?://[^\s/?#]\S*}i)
    return nil unless m

    url = m[0].sub(/\Ahttps?/i, 'http')
    url.sub!(%r{/+\z}, '')
    url.sub!('#!/', '')
    url = downcase_domain(url)
    remove_www!(url)
    drop_anchor!(special_cases(url))
  end

  # Identifies the service a URL belongs to from its host name.
  #
  # Scheme and host are compared case-insensitively. Subdomains count as the
  # service (`m.youtube.com`), look-alike hosts (`not-youtube.com`) do not.
  #
  # @param url [String] an http(s) URL
  # @return [Symbol] one of the symbols in SERVICE_DOMAINS, or :other
  def self.service_of(url)
    domain_match = url.match(%r{https?://([^/?#]+)}i)
    return :other unless domain_match

    host = domain_match[1].downcase
    SERVICE_DOMAINS.each do |domain, service|
      return service if host == domain || host.end_with?(".#{domain}")
    end

    :other
  end

  # Tells whether +url+, once scrubbed, looks like the canonical profile/page
  # URL of its service.
  #
  # @param url [String, nil] any text accepted by {scrub}
  # @return [Boolean] false when no URL can be extracted; true for URLs of
  #   unrecognised services
  def self.ideal_form?(url)
    url = scrub(url)
    return false unless url

    pattern = IDEAL_FORMS[service_of(url)]
    return true unless pattern

    !!(url =~ pattern)
  end

  # The methods below are internal helpers. A bare `private` does not apply to
  # methods defined with `def self.`, so they have always been callable from
  # outside; they are left callable to stay backward compatible.

  # Lower-cases the scheme and host part of +url+, leaving path and query as is.
  #
  # @return [String] a new string; +url+ unchanged when it has no http(s) prefix
  def self.downcase_domain(url)
    url.sub(%r{https?://[^/?#]+}i) { |origin| origin.downcase }
  end

  # Applies the service-specific clean-up for the service +url+ belongs to.
  #
  # @return [String] the cleaned URL (may or may not be the same object)
  def self.special_cases(url)
    case service_of(url)
    when :youtube   then youtube(url)
    when :twitter   then twitter(url)
    when :facebook  then facebook(url)
    when :linkedin  then linkedin(url)
    when :google    then google_plus(url)
    when :flickr    then flickr(url)
    when :pinterest then pinterest(url)
    when :yelp      then yelp(url)
    else url
    end
  end

  # Removes a leading "www." / "www2." etc. from the host, in place.
  def self.remove_www!(url)
    url.sub!(%r{://www\d*\.}, '://')
    url
  end

  # Removes the query string (everything from the first "?"), in place.
  def self.drop_url_query!(url)
    url.sub!(/\?.*$/, '')
    url
  end

  # Removes the fragment (everything from the first "#"), in place.
  def self.drop_anchor!(url)
    url.sub!(/#.*$/, '')
    url
  end

  # Rewrites /user/NAME and /profile?user=NAME to /NAME.
  def self.youtube(url)
    url.sub!('youtube.com/user/', 'youtube.com/')
    url.sub!('youtube.com/profile?user=', 'youtube.com/')
    url
  end

  # Reduces tweet permalinks and @-searches to the user's profile URL and
  # drops a leading "@" from the user name.
  def self.twitter(url)
    url.sub!('twitter.com/@', 'twitter.com/')

    # Both the old "/statuses/ID" and the newer "/status/ID" permalinks.
    status_match = url.match(%r{(twitter\.com/[^/]+)/status(?:es)?/\d+})
    url = "http://#{status_match[1]}" if status_match

    search_match = url.match(%r{twitter\.com/search(?:/realtime)?(?:/|\?q=)(?:@|%40)(\S*)$})
    url = "http://twitter.com/#{search_match[1]}" if search_match

    url
  end

  # Album URLs keep their first query parameter only; numeric profile URLs keep
  # only their `id`; every other URL loses the "home.php?" prefix and its
  # whole query string.
  def self.facebook(url)
    if url.match("/media/albums") || url.match("/media/set")
      return url.split('&', 2).first
    end

    url.sub!(/facebook\.com\/home\.php[\?#!\/]+/, 'facebook.com/')

    # profile.php URLs are told apart by their id only; dropping the query
    # would collapse every such profile into the same URL.
    id_match = url.match(%r{facebook\.com/profile\.php\?(?:[^#]*&)?id=(\d+)})
    return "http://facebook.com/profile.php?id=#{id_match[1]}" if id_match

    drop_url_query!(url)
  end

  # Rewrites /companies/ to /company/ and drops the query of company URLs
  # (profile URLs need their ?id= and keep it).
  def self.linkedin(url)
    url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
    drop_url_query!(url) if url =~ %r{com/company/}
    url
  end

  # Strips the "u/N/" account selector and "b/" page prefix, then keeps only
  # the first path component.
  def self.google_plus(url)
    url.sub!(%r{com/(?:u/\d+/)?(?:b/)?}, 'com/')

    path_match = url.match(%r{\Ahttp://plus\.google\.com/([^/]+)})
    return url unless path_match

    "http://plus.google.com/#{path_match[1]}"
  end

  # Reduces /photos/NAME/... and /people/NAME/... to /NAME.
  def self.flickr(url)
    user_match = url.match(%r{flickr\.com/(photos/|people/)?([^/]+)})
    return url unless user_match

    "http://flickr.com/#{user_match[2]}"
  end

  # No Pinterest-specific clean-up; returns +url+ untouched.
  def self.pinterest(url)
    url
  end

  # No Yelp-specific clean-up; returns +url+ untouched.
  def self.yelp(url)
    url
  end
end
|
@@ -0,0 +1,345 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'url_scrubber'
|
3
|
+
|
4
|
+
# Behavioural specification of UrlScrubber. Value expectations use the
# expect(...) form; the message expectation on .scrub keeps should_receive.
describe UrlScrubber do
  describe '.scrub' do
    it 'should downcase all domain stuff' do
      expect(UrlScrubber.scrub("http://ExAmple.COM")).to eq('http://example.com')
    end

    it 'should remove whitespace and stray characters around an url' do
      expect(UrlScrubber.scrub("d http://example.com\t")).to eq("http://example.com")
    end

    it 'should turn an https url into http' do
      expect(UrlScrubber.scrub('https://example.com')).to eq('http://example.com')
    end

    it 'should remove trailing slashes' do
      expect(UrlScrubber.scrub('http://example.com/')).to eq('http://example.com')
    end

    it 'should remove #!/' do
      expect(UrlScrubber.scrub('http://example.com/#!/page')).to eq('http://example.com/page')
    end

    it 'should drop #anchors' do
      expect(UrlScrubber.scrub('http://example.com/page#query:example+pages')).to eq('http://example.com/page')
    end

    it 'should drop www' do
      expect(UrlScrubber.scrub('http://www.example.com/page')).to eq('http://example.com/page')
    end

    describe 'with youtube urls' do
      it 'should drop /user/ if it exists' do
        expect(UrlScrubber.scrub('http://youtube.com/user/absolutely')).to eq('http://youtube.com/absolutely')
      end

      it 'should handle /profile?user= urls' do
        expect(UrlScrubber.scrub('http://youtube.com/profile?user=absolutely')).to eq('http://youtube.com/absolutely')
      end

      it 'should not kick in for a non-youtube url' do
        expect(UrlScrubber.scrub('http://example.com/user/absolutely')).to_not eq('http://example.com/absolutely')
      end
    end

    describe 'with twitter urls' do
      it 'should drop @ from in front of username' do
        expect(UrlScrubber.scrub('http://twitter.com/@absolutely')).to eq('http://twitter.com/absolutely')
      end

      it 'should not kick in for a non-twitter url' do
        expect(UrlScrubber.scrub('http://example.com/@absolutely')).to_not eq('http://example.com/absolutely')
      end

      it 'should transform /search/realtime/%40username into that username' do
        expect(UrlScrubber.scrub('http://twitter.com/search/realtime/%40absolutely')).to eq('http://twitter.com/absolutely')
      end

      it 'should handle similar search urls' do
        expect(UrlScrubber.scrub('http://twitter.com/search/realtime/@absolutely')).to eq('http://twitter.com/absolutely')
        expect(UrlScrubber.scrub('http://twitter.com/search/%40absolutely')).to eq('http://twitter.com/absolutely')
        expect(UrlScrubber.scrub('http://twitter.com/search/@absolutely')).to eq('http://twitter.com/absolutely')
        expect(UrlScrubber.scrub('http://twitter.com/search/realtime?q=%40absolutely')).to eq('http://twitter.com/absolutely')
      end

      it "should transform user statuses into that user's profile" do
        expect(UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312')).to eq('http://twitter.com/absolutely')
      end
    end

    describe 'with facebook urls' do
      it 'should drop /home.php?#!/ from the beginning of the path' do
        expect(UrlScrubber.scrub('http://facebook.com/home.php?#!/person.name')).to eq('http://facebook.com/person.name')
      end

      it 'should drop www. from the beginning' do
        expect(UrlScrubber.scrub('http://www.facebook.com/person.name')).to eq('http://facebook.com/person.name')
      end

      it 'should drop an url query' do
        expect(UrlScrubber.scrub('http://facebook.com/person.name?ref=pb')).to eq('http://facebook.com/person.name')
      end

      it 'should not kick in for a non-facebook url' do
        expect(UrlScrubber.scrub('http://example.com/home.php?#!/person.name')).to_not eq('http://example.com/person.name')
      end
    end

    describe 'with linkedin urls' do
      it 'should change /companies/ to /company/' do
        expect(UrlScrubber.scrub('http://linkedin.com/companies/1337')).to eq('http://linkedin.com/company/1337')
      end

      it 'should drop www. from the beginning' do
        expect(UrlScrubber.scrub('http://www.linkedin.com/company/1337')).to eq('http://linkedin.com/company/1337')
      end

      it 'should drop query parameters from the end of a company url' do
        expect(UrlScrubber.scrub('http://linkedin.com/company/1337?trk=tyah')).to eq('http://linkedin.com/company/1337')
      end

      it 'should not drop query parameters a profile view url' do
        expect(UrlScrubber.scrub('http://www.linkedin.com/profile/view?id=12341324')).to eq('http://linkedin.com/profile/view?id=12341324')
      end

      it 'should not kick in for a non-linkedin url' do
        expect(UrlScrubber.scrub('http://example.com/companies/1337')).to_not eq('http://example.com/company/1337')
      end
    end

    describe 'with google plus urls' do
      it 'should drop anything after the first significant path component' do
        expect(UrlScrubber.scrub('http://plus.google.com/+SomeName/posts')).to eq('http://plus.google.com/+SomeName')
      end

      it 'should drop u/0/b/ from the beginning of the path' do
        scrubbed = UrlScrubber.scrub('http://plus.google.com/u/0/b/111111111111111111111')
        expect(scrubbed).to eq('http://plus.google.com/111111111111111111111')
      end

      it 'should drop u/0/ from the beginning of the path' do
        scrubbed = UrlScrubber.scrub('https://plus.google.com/u/0/5432123454135/posts')
        expect(scrubbed).to eq('http://plus.google.com/5432123454135')
      end

      it 'should not kick in for a non-google-plus url' do
        expect(UrlScrubber.scrub('http://example.com/+SomeName/posts')).to_not eq('http://example.com/+SomeName')
      end
    end

    describe 'with slideshare urls' do
      pending
    end

    describe 'with flickr urls' do
      it 'should drop /photos/ from profile names' do
        expect(UrlScrubber.scrub('http://flickr.com/photos/username/')).to eq('http://flickr.com/username')
      end

      it 'should drop /people/ from profile names' do
        expect(UrlScrubber.scrub('http://flickr.com/people/username/')).to eq('http://flickr.com/username')
      end

      it 'should drop www. from the beginning' do
        expect(UrlScrubber.scrub('http://www.flickr.com/username')).to eq('http://flickr.com/username')
      end

      it 'should drop path components after the username' do
        expect(UrlScrubber.scrub('http://flickr.com/username/favorites')).to eq('http://flickr.com/username')
      end

      it 'should not kick in for a non-flickr url' do
        expect(UrlScrubber.scrub('http://example.com/photos/username/')).to_not eq('http://example.com/username')
      end
    end

    describe 'with pinterest urls' do
      it 'should drop www. from the beginning' do
        expect(UrlScrubber.scrub('http://www.pinterest.com/username')).to eq('http://pinterest.com/username')
      end
    end

    describe 'with yelp urls' do
      it 'should drop www. from the beginning' do
        expect(UrlScrubber.scrub('http://www.yelp.com/biz/very-important-business')).to eq('http://yelp.com/biz/very-important-business')
      end
    end

    describe 'with vimeo urls' do
      pending
    end
  end

  describe '.service_of' do
    it 'returns :youtube for YouTube urls' do
      expect(UrlScrubber.service_of('http://youtube.com/absolutely')).to eq(:youtube)
    end

    it 'returns :twitter for Twitter urls' do
      expect(UrlScrubber.service_of('http://twitter.com/absolutely')).to eq(:twitter)
    end

    it 'returns :facebook for Facebook urls' do
      expect(UrlScrubber.service_of('http://facebook.com/person.name')).to eq(:facebook)
    end

    it 'returns :linkedin for LinkedIn urls' do
      expect(UrlScrubber.service_of('http://linkedin.com/company/1337')).to eq(:linkedin)
    end

    it 'returns :google for Google+ urls' do
      expect(UrlScrubber.service_of('http://plus.google.com/+SomeName')).to eq(:google)
    end

    it 'returns :slideshare for SlideShare urls' do
      expect(UrlScrubber.service_of('http://slideshare.net/absolutely')).to eq(:slideshare)
    end

    it 'returns :flickr for Flickr urls' do
      expect(UrlScrubber.service_of('http://flickr.com/username')).to eq(:flickr)
    end

    it 'returns :pinterest for Pinterest urls' do
      expect(UrlScrubber.service_of('http://pinterest.com/username')).to eq(:pinterest)
    end

    it 'returns :yelp for Yelp urls' do
      expect(UrlScrubber.service_of('http://yelp.com/very-important-business')).to eq(:yelp)
    end

    it 'returns :vimeo for Vimeo urls' do
      expect(UrlScrubber.service_of('http://vimeo.com/absolutely')).to eq(:vimeo)
    end

    it 'returns :other for other urls' do
      expect(UrlScrubber.service_of('http://example.com/page')).to eq(:other)
    end
  end

  describe '.ideal_form?' do
    it 'scrubs the url beforehand' do
      UrlScrubber.should_receive(:scrub).with('http://example.com/some-page/').and_return('http://example.com/some-page')
      UrlScrubber.ideal_form?('http://example.com/some-page/')
    end

    describe 'for youtube' do
      it 'returns true for apparent channel urls' do
        expect(UrlScrubber.ideal_form?('http://youtube.com/absolutely')).to be_true
      end

      it 'returns false for videos' do
        expect(UrlScrubber.ideal_form?('http://youtube.com/watch?v=vRGMAW1wzQ8')).to be_false
      end
    end

    describe 'for twitter' do
      it 'returns true for apparent user urls' do
        expect(UrlScrubber.ideal_form?('http://twitter.com/absolutely')).to be_true
      end

      it 'returns false for other pages' do
        expect(UrlScrubber.ideal_form?('http://twitter.com/')).to be_false
      end
    end

    describe 'for facebook' do
      it 'returns true for apparent user urls' do
        expect(UrlScrubber.ideal_form?('http://facebook.com/person.name')).to be_true
        expect(UrlScrubber.ideal_form?('http://facebook.com/profile.php?id=543123521')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://facebook.com/')).to be_false
      end
    end

    describe 'for linkedin' do
      it 'returns true for apparent company urls' do
        expect(UrlScrubber.ideal_form?('http://linkedin.com/company/1337')).to be_true
        expect(UrlScrubber.ideal_form?('http://www.linkedin.com/profile/view?id=12341324')).to be_true
        expect(UrlScrubber.ideal_form?('http://linkedin.com/company/brand-name')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://linkedin.com/jobs/c-Cisco')).to be_false
      end
    end

    describe 'for google plus' do
      it 'returns true for apparent person urls' do
        expect(UrlScrubber.ideal_form?('http://plus.google.com/+SomeName')).to be_true
        expect(UrlScrubber.ideal_form?('http://plus.google.com/u/0/b/111111111111111111111')).to be_true
        expect(UrlScrubber.ideal_form?('https://plus.google.com/u/0/5432123454135/posts')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://plus.google.com/')).to be_false
      end
    end

    describe 'for slideshare' do
      it 'returns true for apparent user urls' do
        expect(UrlScrubber.ideal_form?('http://slideshare.net/absolutely')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://slideshare.net/absolutely/how-to-create-great-slides-for-presentations')).to be_false
      end
    end

    describe 'for flickr' do
      it 'returns true for apparent user urls' do
        expect(UrlScrubber.ideal_form?('http://flickr.com/username')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://flickr.com/')).to be_false
      end
    end

    describe 'for pinterest' do
      it 'returns true for apparent user urls' do
        expect(UrlScrubber.ideal_form?('http://pinterest.com/username')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://pinterest.com/pin/532412513451524')).to be_false
      end
    end

    describe 'for yelp' do
      it 'returns true for apparent business urls' do
        expect(UrlScrubber.ideal_form?('http://yelp.com/very-important-business')).to be_true
      end

      it 'returns false for other urls' do
        expect(UrlScrubber.ideal_form?('http://yelp.com/user_details?userid=Aheunaobuh-huoanuhbAU')).to be_false
      end
    end

    describe 'for vimeo' do
      it 'returns true for apparent user urls' do
        expect(UrlScrubber.ideal_form?('http://vimeo.com/absolutely')).to be_true
      end

      it 'returns false for video urls' do
        expect(UrlScrubber.ideal_form?('http://vimeo.com/45453874578')).to be_false
      end
    end

    describe 'for other sites' do
      it 'returns true for any other site, really' do
        expect(UrlScrubber.ideal_form?('http://example.com/absolutely/anything')).to be_true
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/url_scrubber/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for url_scrubber. The version comes from
# UrlScrubber::VERSION and the packaged file list from `git ls-files`.
Gem::Specification.new do |gem|
  gem.name          = "url_scrubber"
  gem.version       = UrlScrubber::VERSION
  gem.authors       = ["Colin Langton", "Christopher Maujean"]
  gem.email         = ["colin@hoteldelta.net", "cmaujean@brandle.net"]
  gem.description   = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
  gem.summary       = %q{Clean up URLs.}
  gem.homepage      = "http://brandle.net"

  # Split on newlines. The previous `split($\)` passed the output record
  # separator, which is nil by default, so String#split fell back to splitting
  # on any whitespace and broke paths that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # testing
  gem.add_development_dependency 'rspec', '~> 2.11.0'
  gem.add_development_dependency 'guard-bundler', "~> 0.1.3"
  gem.add_development_dependency 'guard-rspec', "~> 0.4.3"
  gem.add_development_dependency 'terminal-notifier-guard'
  gem.add_development_dependency 'rb-fsevent', '~> 0.9.1'
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_scrubber
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Colin Langton
|
9
|
+
- Christopher Maujean
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2013-06-06 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: &2152156960 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.11.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *2152156960
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: guard-bundler
|
28
|
+
requirement: &2152155880 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.3
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2152155880
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: guard-rspec
|
39
|
+
requirement: &2152154720 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 0.4.3
|
45
|
+
type: :development
|
46
|
+
prerelease: false
|
47
|
+
version_requirements: *2152154720
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: terminal-notifier-guard
|
50
|
+
requirement: &2152153980 !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
type: :development
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: *2152153980
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: rb-fsevent
|
61
|
+
requirement: &2152153300 !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ~>
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 0.9.1
|
67
|
+
type: :development
|
68
|
+
prerelease: false
|
69
|
+
version_requirements: *2152153300
|
70
|
+
description: Remove extraneous bits from URLs, follow redirects, identify social media
|
71
|
+
urls, etc.
|
72
|
+
email:
|
73
|
+
- colin@hoteldelta.net
|
74
|
+
- cmaujean@brandle.net
|
75
|
+
executables: []
|
76
|
+
extensions: []
|
77
|
+
extra_rdoc_files: []
|
78
|
+
files:
|
79
|
+
- .gitignore
|
80
|
+
- .rvmrc
|
81
|
+
- Gemfile
|
82
|
+
- Guardfile
|
83
|
+
- README.md
|
84
|
+
- Rakefile
|
85
|
+
- lib/url_scrubber.rb
|
86
|
+
- lib/url_scrubber/version.rb
|
87
|
+
- spec/url_scrubber_spec.rb
|
88
|
+
- url_scrubber.gemspec
|
89
|
+
homepage: http://brandle.net
|
90
|
+
licenses: []
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
none: false
|
97
|
+
requirements:
|
98
|
+
- - ! '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ! '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 1.8.16
|
110
|
+
signing_key:
|
111
|
+
specification_version: 3
|
112
|
+
summary: Clean up URLs.
|
113
|
+
test_files:
|
114
|
+
- spec/url_scrubber_spec.rb
|