url_common 0.1.0 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4f3290f0cba2dcd19ebc741320e0153f2ba5065f927914f850f956d675fe8752
-  data.tar.gz: f93a6899d9b39729db16140698c35885c4b000710cc9b5e1dc635dc9a21467aa
+  metadata.gz: 0ab8c804a828390106ee6e442cd46362ef5648ff9d41d36317f2644cb9f21e15
+  data.tar.gz: 7d6068a766ed82a3c4fe724fe426549c9c56047acbf373eb46d87aba763dfab0
 SHA512:
-  metadata.gz: a534233ebf72a903303eb4b273459ec717d81fb4acff7daafbfe57f368be7c44b06d9fd1773561506c0b1957b1eac61e5012ff9d582743245b100009e8feaa72
-  data.tar.gz: 5b2efc559ad70767383a9fd35a458ef524713bb12c1c1c0c3393527e98a7ccde3ab32480793a2a91cd23c7bbbefffaaa3e16f12ca7656c75f7920621ceb9fc37
+  metadata.gz: bddc296b7dae2781ec1ae7f8b8c79a1327e93bc1bfe30c761366033ef0882049948ea8c2ebbd7ce425aa7d437086b7ec38a4b30ebf0e976a4213990ef1e2d2b6
+  data.tar.gz: 395a2d6fec4519067d4dbb08f78c3e67cd4b1d2b1c7c8083299398790c5b9f9e6db81aade8c093f6b2eb39975f30b6923a80634d928e9c6fad159e12bb6a840d
data/Gemfile CHANGED
@@ -3,8 +3,15 @@ source "https://rubygems.org"
 # Specify your gem's dependencies in url_common.gemspec
 gemspec
 
+ruby "3.1.2"
+
 gem "rake", "~> 12.0"
 gem "rspec", "~> 3.0"
 gem "fuzzyurl", '~> 0.9.0'
-gem 'mechanize', '~> 2.6'
 gem "byebug"
+
+gem "hpricot", "~> 0.8.6"
+gem 'net-http-persistent', github: 'drbrain/net-http-persistent'
+gem "mechanize", "~> 2.7"
+
+gem "webrick", "~> 1.7"
data/Gemfile.lock ADDED
@@ -0,0 +1,82 @@
+GIT
+  remote: https://github.com/drbrain/net-http-persistent.git
+  revision: 857c3baaa541644fa437328b535042a500414119
+  specs:
+    net-http-persistent (4.0.1)
+      connection_pool (~> 2.2)
+
+PATH
+  remote: .
+  specs:
+    url_common (0.1.3)
+      fuzzyurl (~> 0.9.0)
+      mechanize (~> 2.6)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    byebug (11.1.3)
+    connection_pool (2.2.5)
+    diff-lcs (1.4.4)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    fuzzyurl (0.9.0)
+    hpricot (0.8.6)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    mechanize (2.7.6)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (>= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (>= 2.5.2)
+      nokogiri (~> 1.6)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (3.3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2020.0512)
+    mini_portile2 (2.4.0)
+    net-http-digest_auth (1.4.1)
+    nokogiri (1.10.10)
+      mini_portile2 (~> 2.4.0)
+    ntlm-http (0.1.1)
+    rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.7)
+    webrick (1.7.0)
+    webrobots (0.1.2)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  byebug
+  fuzzyurl (~> 0.9.0)
+  hpricot (~> 0.8.6)
+  mechanize (~> 2.7)
+  net-http-persistent!
+  rake (~> 12.0)
+  rspec (~> 3.0)
+  url_common!
+  webrick (~> 1.7)
+
+RUBY VERSION
+   ruby 3.1.2p20
+
+BUNDLED WITH
+   2.1.4
data/README.md CHANGED
@@ -1,8 +1,8 @@
 # UrlCommon
 
-Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/url_common`. To experiment with that code, run `bin/console` for an interactive prompt.
+This is a gem for performing common Url centric things. I wrote this years ago and have always just moved it from project to project leading to a huge number of different versions on my development system. Finally I'm creating a gem out of it to aid in its use across projects.
 
-TODO: Delete this and the text above, and describe your gem
+I don't claim that these are great, perfect, etc. I claim that they are workman like tools which I FIND USEFUL and I want to use them more easily across multiple projects hence the open sourcing of them.
 
 ## Installation
 
@@ -24,6 +24,8 @@ Or install it yourself as:
 
 TODO: Write usage instructions here
 
+This is a todo.
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/url_common/version.rb CHANGED
@@ -1,3 +1,3 @@
 module UrlCommon
-  VERSION = "0.1.0"
+  VERSION = "0.1.3"
 end
data/lib/url_common.rb CHANGED
@@ -30,6 +30,17 @@ module UrlCommon
     end
   end
 
+  # UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/?_encoding=UTF8&pd_rd_w=cekvo&content-id=amzn1.sym.bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_p=bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_r=3WP00V89EKYCQ1PB16VY&pd_rd_wg=HlQVt&pd_rd_r=30b33abe-2010-435e-b2cc-338f2ffbf3cf&ref_=pd_gw_ci_mcx_mi")
+  def self.parse_fid_from_amazon_url(url)
+    tmp = /\/dp\/([A-Za-z0-9]+)/.match(url)
+    if tmp && tmp[1]
+      return tmp[1]
+    else
+      return nil
+    end
+  end
+
+
   def self.parse_country_from_itunes_url(url)
     country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
     if country
@@ -39,9 +50,22 @@ module UrlCommon
     return 'us'
   end
 
+  # original
+  # def self.get_base_domain(url)
+  #   parts = URI.parse(url)
+  #   return parts.host.gsub(/^www./,'')
+  # end
+
   def self.get_base_domain(url)
-    parts = URI.parse(url)
-    return parts.host.gsub(/^www./,'')
+    #debugger if url =~ /c06rh22whx1g/
+    begin
+      url = url.gsub(/ /,'%20')
+      parts = URI.parse(url)
+      return parts.host.gsub(/^www./,'')
+    rescue StandardError => e
+      fu = Fuzzyurl.from_string(url)
+      return fu.hostname.gsub(/^www./,'')
+    end
   end
 
   def self.join(base, rest, debug = false)
@@ -60,9 +84,19 @@ module UrlCommon
     end
   end
 
-  #TODO
   def self.count_links(html)
-    return 0
+    if html =~ /<html/i
+      content_type = "html"
+    else
+      content_type = "ascii"
+    end
+    parts = html.split(" ")
+    link_ctr = 0
+    parts.each do |part|
+      link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
+      link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
+    end
+    link_ctr
   end
 
   def self.agent
@@ -82,13 +116,23 @@ module UrlCommon
   #
   def self.url_base(url, base_domain=nil)
     if base_domain.nil?
-      base_domain = get_base_domain(url)
+      base_domain = UrlCommon.get_base_domain(url)
+    end
+    begin
+      url = url.gsub(/ /,'%20')
+      parts = URI.parse(url)
+      extra = ""
+      extra = "?#{parts.query}" if parts.query
+      url_base = "#{base_domain}#{parts.path}#{extra}"
+      return url_base[0..254]
+    rescue StandardError => e
+      fu = Fuzzyurl.from_string(url)
+      base_domain = UrlCommon.get_base_domain(url)
+      extra = ""
+      extra = "?#{fu.query}" if fu.query
+      url_base = "#{base_domain}#{fu.path}#{extra}"
+      return url_base[0..254]
     end
-    parts = URI.parse(url)
-    extra = ""
-    extra = "?#{parts.query}" if parts.query
-    url_base = "#{base_domain}#{parts.path}#{extra}"
-    return url_base[0..254]
   end
 
   #tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
@@ -262,8 +306,241 @@ module UrlCommon
   #TODO needs tests
   def self.create_mechanize_page_from_html(url, html)
     mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
+    url = url.gsub(/ /,'%20')
     mechanize_page.uri = URI.parse(url)
-
+
     return mechanize_page
   end
+
+  #TODO needs tests
+  def self.get_meta_description(url, html)
+    page = UrlCommon.create_mechanize_page_from_html(url, html)
+    description = ""
+    begin
+      description = page.parser.at("meta[name='description']")['content']
+    rescue StandardError => e
+    end
+    return description
+  end
+
+  #TODO needs tests
+  # UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
+  def self.get_page_title(url, html)
+    page = UrlCommon.create_mechanize_page_from_html(url, html)
+    title = ""
+    begin
+      title = page.parser.css('title').first.content
+    rescue StandardError => e
+    end
+    return title
+  end
+
+  def self.extract_links_from_text(text)
+    agent = Mechanize.new
+    html = "<HTML><BODY>#{text}</BODY></HTML>"
+    page = Mechanize::Page.new(nil,{'content-type'=>'text/html'},html,nil,agent)
+    return page.links
+  end
+
+  # https://docs.aylien.com/textapi/#using-the-api
+  def self.summarize_url(url)
+    #GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
+    agent = Mechanize.new
+    summarization_url = ""
+    page = agent.get(url)
+  end
+
+  # fucking idiotic test case for this fucking idiot is: https://devslopes.com/
+  def self.test_random_url(url_or_host)
+    random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
+    if url_or_host =~ /http/
+      url = File.join(url_or_host, random_filename)
+    else
+      url = File.join("http://", host, random_filename)
+    end
+    status, url = UrlCommon.check_for_404(url, true)
+    #
+    # Key bit of logic -- if we get a return value for a randomized sha then that means that
+    # a) the destination site owner is a fucking moron
+    # b) that the destination site owner has set his site so it NEVER returns a 404
+    # c) they're a fucking moron
+    # d) if I get a 200 back then it means that they return you to the home page for anything and NOT
+    #    a proper 404 so need to flip flop the logic and return error on a 200; sheesh
+    #
+    return :error, url if status == :ok
+    return :ok, url
+  end
+
+  def self.select_best_rssurl_from_rssurls(urls)
+    return urls.sort_by(&:length).first
+  end
+
+  def self.possible_rssurls(site_url, skip_slash_blog = false)
+    # urls we will probe
+    possible_rssurl_formats = []
+
+    # normal baselines
+    possible_rssurl_formats << "feed.xml"
+    possible_rssurl_formats << "rss.xml"
+    possible_rssurl_formats << "atom.xml"
+    possible_rssurl_formats << "feed/"
+
+    # optionally look at /blog/
+    possible_rssurl_formats << "/blog/feed.xml"
+    possible_rssurl_formats << "/blog/rss.xml"
+    possible_rssurl_formats << "/blog/atom.xml"
+    possible_rssurl_formats << "/blog/feed/"
+
+    possible_rssurls = []
+    possible_rssurl_formats.each do |url_format|
+      possible_rssurls << UrlCommon.join(site_url, url_format)
+    end
+
+    return possible_rssurls
+  end
+
+  def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
+    if page
+      status = :ok
+    else
+      status, page = UrlCommon.get_page(site_url)
+    end
+    puts "Into html parse for rssurl" if debug
+    possibles = []
+    if status == :ok && page
+      #results = page.css("link[rel='alternate']")
+      results = page.css("link[rel='alternate'][type='application/rss+xml']")
+      #
+      # If only a single one then return it
+      #
+      #return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
+      return results.first['href'] if results.size == 1
+
+      #
+      # If an array then filter out the comments
+      #
+      results.each do |result|
+        possibles << result unless result['title'] =~ /comments? feed/i
+      end
+
+      #
+      # Loop over the possibles and just return the shortest url
+      #
+      # Todo -- can likely do a better job on this
+      #
+      urls = []
+      possibles.each do |possible|
+        urls << possible['href']
+      end
+      return UrlCommon.select_best_rssurl_from_rssurls(urls)
+      #return urls.sort_by(&:length).first
+
+
+      # results.each do |result|
+      #
+      # end
+      # end
+      # doc = Nokogiri::HTML(page.body)
+      # results << doc.at('link[rel="alternate"]')
+      # results = results.flatten
+    end
+  end
+
+  def self.get_protocol(url)
+    parts = url.to_s.split(":")
+    return parts.first
+  end
+
+  #https://500hats.com/feed
+  # UrlCommon.discover_feed_url("https://nickjanetakis.com")
+  def self.discover_feed_url(site_url, debug = false)
+    # step 1: remove the file from the site_url if it has one
+    # step 2: problem the common ones and 404 check
+
+    #
+    # Build a set of possibles
+    #
+    possible_rssurls = UrlCommon.possible_rssurls(site_url)
+
+    #
+    # Keep track of failures
+    #
+    failed_probes = Set.new
+
+    # step 3: parse the html
+    #<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
+    #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Feed" href="https://ma.tt/feed/" />
+    #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Comments Feed" href="https://ma.tt/comments/feed/" />
+
+    #
+    # Stage 1 -- do http head probing
+    #
+    possible_rssurls.each do |rssurl|
+      puts "Head Probing for: #{rssurl}" if debug
+
+      # abort if we doubled blog i.e. /blog/blog/ in the url
+      next if rssurl =~ /blog\/blog/
+      next if failed_probes.include?(rssurl)
+
+      status, url = UrlCommon.check_for_404(rssurl, true)
+      random_status, random_url = UrlCommon.test_random_url(site_url)
+      #debugger
+      return rssurl if status == :ok && random_status == :ok
+      failed_probes << rssurl
+    end
+
+    puts "After probe, failed_probes as: #{failed_probes.inspect}"
+
+    #
+    # Stage 2 -- if subdirectory go up one level and probe again
+    #
+    # TODO
+
+
+
+    #
+    # Stage 3 -- Goto root and probe again
+    #
+    #test for this is the nick site
+    fuzzy_url_parts = Fuzzyurl.new(site_url)
+    base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
+    possible_rssurls = UrlCommon.possible_rssurls(base_url)
+    #debugger
+    possible_rssurls.each do |rssurl|
+      puts "Head Probing for: #{rssurl} at site root stage" #if debug
+
+      # abort if we doubled blog i.e. /blog/blog/ in the url
+      next if rssurl =~ /blog\/blog/
+      next if failed_probes.include?(rssurl)
+
+      status, url = UrlCommon.check_for_404(rssurl, true)
+      return rssurl if status == :ok
+      failed_probes << rssurl
+    end
+
+
+    #
+    # Stage 4 - parse the html
+    #
+    rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
+    return rssurl if rssurl
+
+    #
+    # Stage 5 - fall over to feedback
+    #
+    results = Feedbag.find(site_url)
+    # checked_results = []
+    # results.each do |result|
+    #   struct = UrlCommon.check_for_404(result)
+    #   checked_results << result if struct.status == 200
+    # end
+
+    #
+    # Stage 6 - cache failures to redis so don't look for them again
+    #
+    #$redis.
+
+    return UrlCommon.select_best_rssurl_from_rssurls(results)
+  end
+
 end
data/url_common.gemspec CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+
+  spec.add_dependency 'fuzzyurl', '~> 0.9.0'
+  spec.add_dependency 'mechanize', '~> 2.6'
+
 end
metadata CHANGED
@@ -1,15 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: url_common
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.3
 platform: ruby
 authors:
 - Scott Johnson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-08-12 00:00:00.000000000 Z
-dependencies: []
+date: 2022-07-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: fuzzyurl
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.0
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.6'
 description: This is a class library for common url manipulation and crawling tasks. It
   is based on a career focused on the practical side of working with the Internet
   using Ruby.
@@ -24,6 +52,7 @@ files:
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
+- Gemfile.lock
 - LICENSE.txt
 - README.md
 - Rakefile
@@ -55,7 +84,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.1.2
+rubygems_version: 3.3.7
 signing_key:
 specification_version: 4
 summary: This is a class library designed for common url manipulation and crawling
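
For orientation, a minimal usage sketch of a few helpers this release adds (not part of the diff above; the URLs and return values are illustrative, and it assumes the 0.1.3 gem and its mechanize/fuzzyurl dependencies are installed):

require 'url_common'

# Pull the product id out of an Amazon /dp/ URL (added in 0.1.3).
UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver/dp/B0845919P2/")
# => "B0845919P2"

# Hostname without the leading www.; spaces are now escaped before parsing,
# and Fuzzyurl is used as a fallback when URI.parse rejects the string.
UrlCommon.get_base_domain("https://www.example.com/some page")
# => "example.com"

# Rough link count for plain text (HTML input counts <a> tags instead).
UrlCommon.count_links("see https://example.com and https://example.org")
# => 2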