RubyGems - url_scrubber - Versions diffs - 0.8.12 → 0.8.13 - Mend

url_scrubber 0.8.12 → 0.8.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml CHANGED Viewed

@@ -1,15 +1,7 @@
 ---
-!binary "U0hBMQ==":
-  metadata.gz: !binary |-
-    ZmJkNTc2MWQ5NmU0NWNkMTZhMmE1NDZmOTFmMGRjOGE4NDc3ZjJkMA==
-  data.tar.gz: !binary |-
-    NmYzMDIzNTE1OGIxZWM3ZTRhZmE0NGE2NzI2YmIzZWIzZDJhNWQ0ZQ==
+SHA256:
+  metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
+  data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
 SHA512:
-  metadata.gz: !binary |-
-    ZWZjMGY0YmVhZDM3OThhZjExNDAzMWUyYzlkYjdlMzg3N2Q0MWM0MGM3ZTkx
-    MWY2Yzk1Y2FmZTMyODVhNjFlMDg2OWU0ODc5OTQ0N2ViZGEyMTNhZmQyNDU2
-    NmI2MmE5NzRmMzQ2YWYyY2I1N2IyMTU3ZWQzYzU1ZGM3YWZiMTY=
-  data.tar.gz: !binary |-
-    YzBkOTY2YjQ0ZDFiOGViNGEzNDMxYzM2YmVhYWZkMWFkMWZkOTg3ZGY4YTFj
-    NWIyZmI0N2E3ZDZlZGQ4OWZlYThiMzhlMWFiNWZkOTBhMzYxZTdmMjkxM2U0
-    ZjZkNDAxZjU2NDY3OGUwZjk2MDQxN2IyZjE0YzRiYjUwMTRmYmU=
+  metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
+  data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b

data/lib/url_scrubber.rb CHANGED Viewed

@@ -9,9 +9,9 @@ module UrlScrubber
   def self.scrub(url)
     return url if url.blank?
     return url if /^app:\/\//.match(url)  # Do not scrub app-only URLs
     url = url.clone # don't modify the original argument
     m = url.match(/(htt?ps?:\/\/\S*)/i)
     return nil unless m
@@ -25,6 +25,7 @@ module UrlScrubber
     url = downcase_domain(url)
     remove_subdomain!(url)
     remove_html_tags!(url)
+    # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
     url = drop_anchor!(special_cases(url))
     url.sub!(/,+$/, "")    # remove one or more trailing commas at the end of the URL
     url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
@@ -33,16 +34,16 @@ module UrlScrubber
   def self.service_of(url)
     domain_match = url.match(%r{https?://([^/]+)})
     if domain_match
       domain = domain_match[1]
       first_dot = domain.index(".")
   	  #first_dot_position = domain.index(".")
   	  #first_dot_position += 1 if first_dot_position
   	  #Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
   	  if first_dot
         # tumblr is a unique format
@@ -69,7 +70,7 @@ module UrlScrubber
     end
     :other
-  end
+  end
   def self.ideal_form?(url)
@@ -114,14 +115,14 @@ module UrlScrubber
     return url.include?('http://linkedin.com/company/')
   end
   def self.linkedin_personal_url?(url)
     url = scrub(url)
     return false unless url
-    return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
+    return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
   end
   def self.find_identity_from_url(url)
     UrlScrubber.scrub(url).split("/").last unless url.nil?
   end
@@ -129,7 +130,7 @@ module UrlScrubber
   def self.find_linkedin_identity_from_url(url)
     return nil if url.nil?
-    scrubbed_url = scrub(url)
+    scrubbed_url = scrub(url)
     if scrubbed_url && linkedin_company_url?(scrubbed_url)
       scrubbed_url.split("/").last
     elsif scrubbed_url && scrubbed_url.include?('http://linkedin.com/in/')
@@ -173,7 +174,7 @@ module UrlScrubber
     public_url
   end
   ################################################################################
   private
   ################################################################################
@@ -197,6 +198,8 @@ module UrlScrubber
     when :pinterest then return sc_pinterest(url)
     when :vimeo     then return sc_vimeo(url)
     when :yelp      then return sc_yelp(url)
+    else
+      sc_generic(url)
     end
     url
@@ -225,7 +228,7 @@ module UrlScrubber
     url
   end
   def self.drop_url_query!(url)
     url.sub!(/\?.*$/, '')
     url
@@ -244,15 +247,16 @@ module UrlScrubber
     # which is not separate channel with it's own customUrl.
     # url.sub!('youtube.com/user/', 'youtube.com/')
     url.sub!('youtube.com/profile?user=', 'youtube.com/')
+    drop_url_query!(url)
     url
   end
   def self.sc_vimeo(url)
-    if url.include?('vimeo.com/groups/')
+    if url.include?('vimeo.com/groups/')
       groups_partition = url.partition('vimeo.com/groups/')
       if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
-        extraneous_slash_partition = groups_partition[2].partition('/')
+        extraneous_slash_partition = groups_partition[2].partition('/')
         if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
           # need to trim off the sub page stuff
           return "http://vimeo.com/groups/" + extraneous_slash_partition[0]
@@ -276,18 +280,20 @@ module UrlScrubber
       url = "http://twitter.com/#{search_match[1]}"
     end
+    url = drop_url_query!(url)
     url
   end
   def self.sc_facebook(url)
     #puts "sc_facebook: #{url}"
-    regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
-    regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
+    regex1  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
+    regex2  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
     regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
-    regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pg\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
-    regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
+    regex3  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
+    regex4  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
     # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
     # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
     if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
@@ -332,14 +338,14 @@ module UrlScrubber
       url = drop_url_query!(url)
     elsif url.include?("facebook.com/profile.php?id=")
       # puts "profile.php"
-      # these were being truncated, they do redirect, but typically a 301 response is generated
+      # these were being truncated, they do redirect, but typically a 301 response is generated
       # so the url is returned unchanged.  Better than truncation.
       url, http_response = check_for_facebook_redirection(url)
     else
       # puts "else"
       url = drop_url_query!(url)
     end
     # Due to the redirection check, "https" and "www." can be re-introduced
     url = url.sub(%r{^https?://www.}i, 'http://')
     url = url.sub(/\?_rdr.*/, '')
@@ -348,7 +354,6 @@ module UrlScrubber
   def self.sc_linkedin(url)
     url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
     if !!url.match(%r{com/company/})
       drop_url_query!(url)
@@ -388,10 +393,10 @@ module UrlScrubber
   def self.sc_flickr(url)
-    if url.include?('flickr.com/groups/')
+    if url.include?('flickr.com/groups/')
       groups_partition = url.partition('flickr.com/groups/')
       if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
-        extraneous_slash_partition = groups_partition[2].partition('/')
+        extraneous_slash_partition = groups_partition[2].partition('/')
         if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
           # need to trim off the sub page stuff
           return "http://flickr.com/groups/" + extraneous_slash_partition[0]
@@ -408,14 +413,22 @@ module UrlScrubber
   def self.sc_pinterest(url)
+    drop_url_query!(url)
     url
   end
   def self.sc_yelp(url)
+    drop_url_query!(url)
     url
   end
+  def self.sc_generic(url)
+    drop_url_query!(url)
+    url
+  end
   def self.check_for_facebook_redirection(uri_str, limit = 5)
     #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
@@ -434,9 +447,9 @@ module UrlScrubber
     uri_str_new = uri_str.sub('http://', 'https://')
     uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
     begin
-      url = URI.parse(URI.escape(uri_str_new))
+      url = URI.parse(URI.escape(uri_str_new))
     rescue URI::InvalidURIError => e
       return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
     end
@@ -444,7 +457,7 @@ module UrlScrubber
     http = Net::HTTP.new(url.host, url.port)
     http = Net::HTTP.new(url.host, url.port)
     http.open_timeout = 7 # only wait up to 7 seconds for a the connection to be established
-    http.read_timeout = 10 # and up to 10 seconds for a response
+    http.read_timeout = 10 # and up to 10 seconds for a response
     if url.port == 443
       http.use_ssl = true
       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -454,7 +467,7 @@ module UrlScrubber
     request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
     begin
-      response = http.request(request)
+      response = http.request(request)
     rescue Timeout::Error
       #Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
       failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
@@ -494,5 +507,5 @@ module UrlScrubber
       return [uri_str_new, response]
     end
   end
 end

data/lib/url_scrubber/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module UrlScrubber
-  VERSION = "0.8.12"
+  VERSION = "0.8.13"
 end

data/spec/url_scrubber_spec.rb CHANGED Viewed

@@ -69,6 +69,10 @@ describe UrlScrubber do
       it "should transform user statuses into that user's profile" do
         UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
       end
+      it "should drop the query part of the url" do
+        UrlScrubber.scrub('http://twitter.com/novartisuk?lang=en').should eq('http://twitter.com/novartisuk')
+      end
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: url_scrubber
 version: !ruby/object:Gem::Version
-  version: 0.8.12
+  version: 0.8.13
 platform: ruby
 authors:
 - Colin Langton
@@ -11,76 +11,76 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-05-20 00:00:00.000000000 Z
+date: 2018-11-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 2.11.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 2.11.0
 - !ruby/object:Gem::Dependency
   name: guard-bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.1.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.1.3
 - !ruby/object:Gem::Dependency
   name: guard-rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.4.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.4.3
 - !ruby/object:Gem::Dependency
   name: terminal-notifier-guard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rb-fsevent
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.9.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 0.9.1
 description: Remove extraneous bits from URLs, follow redirects, identify social media
@@ -94,8 +94,8 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .rvmrc
+- ".gitignore"
+- ".rvmrc"
 - Gemfile
 - Guardfile
 - README.md
@@ -114,17 +114,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.7.7
 signing_key:
 specification_version: 4
 summary: Clean up URLs.