RubyGems - url_scrubber - Versions diffs - 0.8.12 → 0.8.13 - Mend

url_scrubber 0.8.12 → 0.8.13

Files changed (5)

checksums.yaml CHANGED Viewed

@@ -1,15 +1,7 @@
 ---
-!binary "U0hBMQ==":
-  metadata.gz: !binary |-
-    ZmJkNTc2MWQ5NmU0NWNkMTZhMmE1NDZmOTFmMGRjOGE4NDc3ZjJkMA==
-  data.tar.gz: !binary |-
-    NmYzMDIzNTE1OGIxZWM3ZTRhZmE0NGE2NzI2YmIzZWIzZDJhNWQ0ZQ==
+SHA256:
+  metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
+  data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
 SHA512:
-  metadata.gz: !binary |-
-    ZWZjMGY0YmVhZDM3OThhZjExNDAzMWUyYzlkYjdlMzg3N2Q0MWM0MGM3ZTkx
-    MWY2Yzk1Y2FmZTMyODVhNjFlMDg2OWU0ODc5OTQ0N2ViZGEyMTNhZmQyNDU2
-    NmI2MmE5NzRmMzQ2YWYyY2I1N2IyMTU3ZWQzYzU1ZGM3YWZiMTY=
-  data.tar.gz: !binary |-
-    YzBkOTY2YjQ0ZDFiOGViNGEzNDMxYzM2YmVhYWZkMWFkMWZkOTg3ZGY4YTFj
-    NWIyZmI0N2E3ZDZlZGQ4OWZlYThiMzhlMWFiNWZkOTBhMzYxZTdmMjkxM2U0
-    ZjZkNDAxZjU2NDY3OGUwZjk2MDQxN2IyZjE0YzRiYjUwMTRmYmU=
+  metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
+  data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b

data/lib/url_scrubber.rb CHANGED Viewed

@@ -9,9 +9,9 @@ module UrlScrubber
   def self.scrub(url)
     return url if url.blank?
     return url if /^app:\/\//.match(url)  # Do not scrub app-only URLs
     url = url.clone # don't modify the original argument
     m = url.match(/(htt?ps?:\/\/\S*)/i)
     return nil unless m
@@ -25,6 +25,7 @@ module UrlScrubber
     url = downcase_domain(url)
     remove_subdomain!(url)
     remove_html_tags!(url)
+    # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
     url = drop_anchor!(special_cases(url))
     url.sub!(/,+$/, "")    # remove one or more trailing commas at the end of the URL
     url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
@@ -33,16 +34,16 @@ module UrlScrubber
   def self.service_of(url)
     domain_match = url.match(%r{https?://([^/]+)})
     if domain_match
       domain = domain_match[1]
       first_dot = domain.index(".")
   	  #first_dot_position = domain.index(".")
   	  #first_dot_position += 1 if first_dot_position
   	  #Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
   	  if first_dot
         # tumblr is a unique format
@@ -69,7 +70,7 @@ module UrlScrubber
     end
     :other
-  end
+  end
   def self.ideal_form?(url)
@@ -114,14 +115,14 @@ module UrlScrubber
     return url.include?('http://linkedin.com/company/')
   end
   def self.linkedin_personal_url?(url)
     url = scrub(url)
     return false unless url
-    return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
+    return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
   end
   def self.find_identity_from_url(url)
     UrlScrubber.scrub(url).split("/").last unless url.nil?
   end
@@ -129,7 +130,7 @@ module UrlScrubber
   def self.find_linkedin_identity_from_url(url)
     return nil if url.nil?
-    scrubbed_url = scrub(url)
+    scrubbed_url = scrub(url)
     if scrubbed_url && linkedin_company_url?(scrubbed_url)
       scrubbed_url.split("/").last
     elsif scrubbed_url && scrubbed_url.include?('http://linkedin.com/in/')
@@ -173,7 +174,7 @@ module UrlScrubber
     public_url
   end
   ################################################################################
   private
   ################################################################################
@@ -197,6 +198,8 @@ module UrlScrubber
     when :pinterest then return sc_pinterest(url)
     when :vimeo     then return sc_vimeo(url)
     when :yelp      then return sc_yelp(url)
+    else
+      sc_generic(url)
     end
     url
@@ -225,7 +228,7 @@ module UrlScrubber
     url
   end
   def self.drop_url_query!(url)
     url.sub!(/\?.*$/, '')
     url
@@ -244,15 +247,16 @@ module UrlScrubber
     # which is not a separate channel with its own customUrl.
     # url.sub!('youtube.com/user/', 'youtube.com/')
     url.sub!('youtube.com/profile?user=', 'youtube.com/')
+    drop_url_query!(url)
     url
   end
   def self.sc_vimeo(url)
-    if url.include?('vimeo.com/groups/')
+    if url.include?('vimeo.com/groups/')
       groups_partition = url.partition('vimeo.com/groups/')
       if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
-        extraneous_slash_partition = groups_partition[2].partition('/')
+        extraneous_slash_partition = groups_partition[2].partition('/')
         if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
           # need to trim off the sub page stuff
           return "http://vimeo.com/groups/" + extraneous_slash_partition[0]
@@ -276,18 +280,20 @@ module UrlScrubber
       url = "http://twitter.com/#{search_match[1]}"
     end
+    url = drop_url_query!(url)
     url
   end
   def self.sc_facebook(url)
     #puts "sc_facebook: #{url}"
-    regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
-    regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
+    regex1  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
+    regex2  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
     regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
-    regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pg\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
-    regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
+    regex3  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
+    regex4  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
     # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
     # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
     if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
@@ -332,14 +338,14 @@ module UrlScrubber
       url = drop_url_query!(url)
     elsif url.include?("facebook.com/profile.php?id=")
       # puts "profile.php"
-      # these were being truncated, they do redirect, but typically a 301 response is generated
+      # these were being truncated, they do redirect, but typically a 301 response is generated
       # so the url is returned unchanged.  Better than truncation.
       url, http_response = check_for_facebook_redirection(url)
     else
       # puts "else"
       url = drop_url_query!(url)
     end
     # Due to the redirection check, "https" and "www." can be re-introduced
     url = url.sub(%r{^https?://www.}i, 'http://')
     url = url.sub(/\?_rdr.*/, '')
@@ -348,7 +354,6 @@ module UrlScrubber
   def self.sc_linkedin(url)
     url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
     if !!url.match(%r{com/company/})
       drop_url_query!(url)
@@ -388,10 +393,10 @@ module UrlScrubber
   def self.sc_flickr(url)
-    if url.include?('flickr.com/groups/')
+    if url.include?('flickr.com/groups/')
       groups_partition = url.partition('flickr.com/groups/')
       if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
-        extraneous_slash_partition = groups_partition[2].partition('/')
+        extraneous_slash_partition = groups_partition[2].partition('/')
         if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
           # need to trim off the sub page stuff
           return "http://flickr.com/groups/" + extraneous_slash_partition[0]
@@ -408,14 +413,22 @@ module UrlScrubber
   def self.sc_pinterest(url)
+    drop_url_query!(url)
     url
   end
   def self.sc_yelp(url)
+    drop_url_query!(url)
     url
   end
+  def self.sc_generic(url)
+    drop_url_query!(url)
+    url
+  end
   def self.check_for_facebook_redirection(uri_str, limit = 5)
     #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
@@ -434,9 +447,9 @@ module UrlScrubber
     uri_str_new = uri_str.sub('http://', 'https://')
     uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
     begin
-      url = URI.parse(URI.escape(uri_str_new))
+      url = URI.parse(URI.escape(uri_str_new))
     rescue URI::InvalidURIError => e
       return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
     end
@@ -444,7 +457,7 @@ module UrlScrubber
     http = Net::HTTP.new(url.host, url.port)
     http = Net::HTTP.new(url.host, url.port)
     http.open_timeout = 7 # only wait up to 7 seconds for a the connection to be established
-    http.read_timeout = 10 # and up to 10 seconds for a response
+    http.read_timeout = 10 # and up to 10 seconds for a response
     if url.port == 443
       http.use_ssl = true
       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -454,7 +467,7 @@ module UrlScrubber
     request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
     begin
-      response = http.request(request)
+      response = http.request(request)
     rescue Timeout::Error
       #Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
       failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
@@ -494,5 +507,5 @@ module UrlScrubber
       return [uri_str_new, response]
     end
   end
 end

data/lib/url_scrubber/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module UrlScrubber
-  VERSION = "0.8.12"
+  VERSION = "0.8.13"
 end

data/spec/url_scrubber_spec.rb CHANGED Viewed

@@ -69,6 +69,10 @@ describe UrlScrubber do
       it "should transform user statuses into that user's profile" do
         UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
       end
+      it "should drop the query part of the url" do
+        UrlScrubber.scrub('http://twitter.com/novartisuk?lang=en').should eq('http://twitter.com/novartisuk')
+      end
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: url_scrubber
 version: !ruby/object:Gem::Version
-  version: 0.8.12
+  version: 0.8.13
 platform: ruby
 authors:
 - Colin Langton
@@ -11,76 +11,76 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-05-20 00:00:00.000000000 Z
+date: 2018-11-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 2.11.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 2.11.0
 - !ruby/object:Gem::Dependency
   name: guard-bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.1.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.1.3
 - !ruby/object:Gem::Dependency
   name: guard-rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.4.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.4.3
 - !ruby/object:Gem::Dependency
   name: terminal-notifier-guard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rb-fsevent
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.9.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.9.1
 description: Remove extraneous bits from URLs, follow redirects, identify social media
@@ -94,8 +94,8 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .rvmrc
+- ".gitignore"
+- ".rvmrc"
 - Gemfile
 - Guardfile
 - README.md
@@ -114,17 +114,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.7.7
 signing_key:
 specification_version: 4
 summary: Clean up URLs.