url_common 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +72 -0
- data/README.md +4 -2
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +231 -0
- data/url_common.gemspec +4 -0
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
|
4
|
+
data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
|
7
|
+
data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
url_common (0.1.1)
|
5
|
+
fuzzyurl (~> 0.9.0)
|
6
|
+
mechanize (~> 2.6)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
byebug (11.1.3)
|
12
|
+
connection_pool (2.2.3)
|
13
|
+
diff-lcs (1.4.4)
|
14
|
+
domain_name (0.5.20190701)
|
15
|
+
unf (>= 0.0.5, < 1.0.0)
|
16
|
+
fuzzyurl (0.9.0)
|
17
|
+
http-cookie (1.0.3)
|
18
|
+
domain_name (~> 0.5)
|
19
|
+
mechanize (2.7.6)
|
20
|
+
domain_name (~> 0.5, >= 0.5.1)
|
21
|
+
http-cookie (~> 1.0)
|
22
|
+
mime-types (>= 1.17.2)
|
23
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
24
|
+
net-http-persistent (>= 2.5.2)
|
25
|
+
nokogiri (~> 1.6)
|
26
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
27
|
+
webrobots (>= 0.0.9, < 0.2)
|
28
|
+
mime-types (3.3.1)
|
29
|
+
mime-types-data (~> 3.2015)
|
30
|
+
mime-types-data (3.2020.0512)
|
31
|
+
mini_portile2 (2.4.0)
|
32
|
+
net-http-digest_auth (1.4.1)
|
33
|
+
net-http-persistent (4.0.0)
|
34
|
+
connection_pool (~> 2.2)
|
35
|
+
nokogiri (1.10.10)
|
36
|
+
mini_portile2 (~> 2.4.0)
|
37
|
+
ntlm-http (0.1.1)
|
38
|
+
rake (12.3.3)
|
39
|
+
rspec (3.9.0)
|
40
|
+
rspec-core (~> 3.9.0)
|
41
|
+
rspec-expectations (~> 3.9.0)
|
42
|
+
rspec-mocks (~> 3.9.0)
|
43
|
+
rspec-core (3.9.2)
|
44
|
+
rspec-support (~> 3.9.3)
|
45
|
+
rspec-expectations (3.9.2)
|
46
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
47
|
+
rspec-support (~> 3.9.0)
|
48
|
+
rspec-mocks (3.9.1)
|
49
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
50
|
+
rspec-support (~> 3.9.0)
|
51
|
+
rspec-support (3.9.3)
|
52
|
+
unf (0.1.4)
|
53
|
+
unf_ext
|
54
|
+
unf_ext (0.0.7.7)
|
55
|
+
webrobots (0.1.2)
|
56
|
+
|
57
|
+
PLATFORMS
|
58
|
+
ruby
|
59
|
+
|
60
|
+
DEPENDENCIES
|
61
|
+
byebug
|
62
|
+
fuzzyurl (~> 0.9.0)
|
63
|
+
mechanize (~> 2.6)
|
64
|
+
rake (~> 12.0)
|
65
|
+
rspec (~> 3.0)
|
66
|
+
url_common!
|
67
|
+
|
68
|
+
RUBY VERSION
|
69
|
+
ruby 2.7.1p83
|
70
|
+
|
71
|
+
BUNDLED WITH
|
72
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# UrlCommon
|
2
2
|
|
3
|
-
|
3
|
+
This is a gem for performing common URL-centric things. I wrote this years ago and have always just moved it from project to project leading to a huge number of different versions on my development system. Finally I'm creating a gem out of it to aid in its use across projects.
|
4
4
|
|
5
|
-
|
5
|
+
I don't claim that these are great, perfect, etc. I claim that they are workman like tools which I FIND USEFUL and I want to use them more easily across multiple projects hence the open sourcing of them.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -24,6 +24,8 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
TODO: Write usage instructions here
|
26
26
|
|
27
|
+
This is a todo.
|
28
|
+
|
27
29
|
## Development
|
28
30
|
|
29
31
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -266,4 +266,235 @@ module UrlCommon
|
|
266
266
|
|
267
267
|
return mechanize_page
|
268
268
|
end
|
269
|
+
|
270
|
+
#TODO needs tests
|
271
|
+
def self.get_meta_description(url, html)
|
272
|
+
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
273
|
+
description = ""
|
274
|
+
begin
|
275
|
+
description = page.parser.at("meta[name='description']")['content']
|
276
|
+
rescue StandardError => e
|
277
|
+
end
|
278
|
+
return description
|
279
|
+
end
|
280
|
+
|
281
|
+
#TODO needs tests
|
282
|
+
def self.get_page_title(url, html)
|
283
|
+
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
|
+
title = ""
|
285
|
+
begin
|
286
|
+
title = page.parser.css('title').first.content
|
287
|
+
rescue StandardError => e
|
288
|
+
end
|
289
|
+
return title
|
290
|
+
end
|
291
|
+
|
292
|
+
def self.extract_links_from_text(text)
|
293
|
+
agent = Mechanize.new
|
294
|
+
html = "<HTML><BODY>#{text}</BODY></HTML>"
|
295
|
+
page = Mechanize::Page.new(nil,{'content-type'=>'text/html'},html,nil,agent)
|
296
|
+
return page.links
|
297
|
+
end
|
298
|
+
|
299
|
+
# https://docs.aylien.com/textapi/#using-the-api
|
300
|
+
def self.summarize_url(url)
|
301
|
+
#GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
|
302
|
+
agent = Mechanize.new
|
303
|
+
summarization_url = ""
|
304
|
+
page = agent.get(url)
|
305
|
+
end
|
306
|
+
|
307
|
+
# fucking idiotic test case for this fucking idiot is: https://devslopes.com/
|
308
|
+
def self.test_random_url(url_or_host)
|
309
|
+
random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
|
310
|
+
if url_or_host =~ /http/
|
311
|
+
url = File.join(url_or_host, random_filename)
|
312
|
+
else
|
313
|
+
url = File.join("http://", host, random_filename)
|
314
|
+
end
|
315
|
+
status, url = UrlCommon.check_for_404(url, true)
|
316
|
+
#
|
317
|
+
# Key bit of logic -- if we get a return value for a randomized sha then that means that
|
318
|
+
# a) the destination site owner is a fucking moron
|
319
|
+
# b) that the destination site owner has set his site so it NEVER returns a 404
|
320
|
+
# c) they're a fucking moron
|
321
|
+
# d) if I get a 200 back then it means that they return you to the home page for anything and NOT
|
322
|
+
# a proper 404 so need to flip flop the logic and return error on a 200; sheesh
|
323
|
+
#
|
324
|
+
return :error, url if status == :ok
|
325
|
+
return :ok, url
|
326
|
+
end
|
327
|
+
|
328
|
+
def self.select_best_rssurl_from_rssurls(urls)
|
329
|
+
return urls.sort_by(&:length).first
|
330
|
+
end
|
331
|
+
|
332
|
+
def self.possible_rssurls(site_url, skip_slash_blog = false)
|
333
|
+
# urls we will probe
|
334
|
+
possible_rssurl_formats = []
|
335
|
+
|
336
|
+
# normal baselines
|
337
|
+
possible_rssurl_formats << "feed.xml"
|
338
|
+
possible_rssurl_formats << "rss.xml"
|
339
|
+
possible_rssurl_formats << "atom.xml"
|
340
|
+
possible_rssurl_formats << "feed/"
|
341
|
+
|
342
|
+
# optionally look at /blog/
|
343
|
+
possible_rssurl_formats << "/blog/feed.xml"
|
344
|
+
possible_rssurl_formats << "/blog/rss.xml"
|
345
|
+
possible_rssurl_formats << "/blog/atom.xml"
|
346
|
+
possible_rssurl_formats << "/blog/feed/"
|
347
|
+
|
348
|
+
possible_rssurls = []
|
349
|
+
possible_rssurl_formats.each do |url_format|
|
350
|
+
possible_rssurls << UrlCommon.join(site_url, url_format)
|
351
|
+
end
|
352
|
+
|
353
|
+
return possible_rssurls
|
354
|
+
end
|
355
|
+
|
356
|
+
def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
|
357
|
+
if page
|
358
|
+
status = :ok
|
359
|
+
else
|
360
|
+
status, page = UrlCommon.get_page(site_url)
|
361
|
+
end
|
362
|
+
puts "Into html parse for rssurl" if debug
|
363
|
+
possibles = []
|
364
|
+
if status == :ok && page
|
365
|
+
#results = page.css("link[rel='alternate']")
|
366
|
+
results = page.css("link[rel='alternate'][type='application/rss+xml']")
|
367
|
+
#
|
368
|
+
# If only a single one then return it
|
369
|
+
#
|
370
|
+
#return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
|
371
|
+
return results.first['href'] if results.size == 1
|
372
|
+
|
373
|
+
#
|
374
|
+
# If an array then filter out the comments
|
375
|
+
#
|
376
|
+
results.each do |result|
|
377
|
+
possibles << result unless result['title'] =~ /comments? feed/i
|
378
|
+
end
|
379
|
+
|
380
|
+
#
|
381
|
+
# Loop over the possibles and just return the shortest url
|
382
|
+
#
|
383
|
+
# Todo -- can likely do a better job on this
|
384
|
+
#
|
385
|
+
urls = []
|
386
|
+
possibles.each do |possible|
|
387
|
+
urls << possible['href']
|
388
|
+
end
|
389
|
+
return UrlCommon.select_best_rssurl_from_rssurls(urls)
|
390
|
+
#return urls.sort_by(&:length).first
|
391
|
+
|
392
|
+
|
393
|
+
# results.each do |result|
|
394
|
+
#
|
395
|
+
# end
|
396
|
+
# end
|
397
|
+
# doc = Nokogiri::HTML(page.body)
|
398
|
+
# results << doc.at('link[rel="alternate"]')
|
399
|
+
# results = results.flatten
|
400
|
+
end
|
401
|
+
end
|
402
|
+
|
403
|
+
def self.get_protocol(url)
|
404
|
+
parts = url.to_s.split(":")
|
405
|
+
return parts.first
|
406
|
+
end
|
407
|
+
|
408
|
+
#https://500hats.com/feed
|
409
|
+
# UrlCommon.discover_feed_url("https://nickjanetakis.com")
|
410
|
+
def self.discover_feed_url(site_url, debug = false)
|
411
|
+
# step 1: remove the file from the site_url if it has one
|
412
|
+
# step 2: probe the common ones and 404 check
|
413
|
+
|
414
|
+
#
|
415
|
+
# Build a set of possibles
|
416
|
+
#
|
417
|
+
possible_rssurls = UrlCommon.possible_rssurls(site_url)
|
418
|
+
|
419
|
+
#
|
420
|
+
# Keep track of failures
|
421
|
+
#
|
422
|
+
failed_probes = Set.new
|
423
|
+
|
424
|
+
# step 3: parse the html
|
425
|
+
#<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
|
426
|
+
#<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg » Feed" href="https://ma.tt/feed/" />
|
427
|
+
#<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg » Comments Feed" href="https://ma.tt/comments/feed/" />
|
428
|
+
|
429
|
+
#
|
430
|
+
# Stage 1 -- do http head probing
|
431
|
+
#
|
432
|
+
possible_rssurls.each do |rssurl|
|
433
|
+
puts "Head Probing for: #{rssurl}" if debug
|
434
|
+
|
435
|
+
# abort if we doubled blog i.e. /blog/blog/ in the url
|
436
|
+
next if rssurl =~ /blog\/blog/
|
437
|
+
next if failed_probes.include?(rssurl)
|
438
|
+
|
439
|
+
status, url = UrlCommon.check_for_404(rssurl, true)
|
440
|
+
random_status, random_url = UrlCommon.test_random_url(site_url)
|
441
|
+
#debugger
|
442
|
+
return rssurl if status == :ok && random_status == :ok
|
443
|
+
failed_probes << rssurl
|
444
|
+
end
|
445
|
+
|
446
|
+
puts "After probe, failed_probes as: #{failed_probes.inspect}"
|
447
|
+
|
448
|
+
#
|
449
|
+
# Stage 2-- if subdirectory go up one level and probe again
|
450
|
+
#
|
451
|
+
# TODO
|
452
|
+
|
453
|
+
|
454
|
+
|
455
|
+
#
|
456
|
+
# Stage 3 -- Goto root and probe again
|
457
|
+
#
|
458
|
+
#test for this is the nick site
|
459
|
+
fuzzy_url_parts = Fuzzyurl.new(site_url)
|
460
|
+
base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
|
461
|
+
possible_rssurls = UrlCommon.possible_rssurls(base_url)
|
462
|
+
#debugger
|
463
|
+
possible_rssurls.each do |rssurl|
|
464
|
+
puts "Head Probing for: #{rssurl} at site root stage" #if debug
|
465
|
+
|
466
|
+
# abort if we doubled blog i.e. /blog/blog/ in the url
|
467
|
+
next if rssurl =~ /blog\/blog/
|
468
|
+
next if failed_probes.include?(rssurl)
|
469
|
+
|
470
|
+
status, url = UrlCommon.check_for_404(rssurl, true)
|
471
|
+
return rssurl if status == :ok
|
472
|
+
failed_probes << rssurl
|
473
|
+
end
|
474
|
+
|
475
|
+
|
476
|
+
#
|
477
|
+
# Stage 4 - parse the html
|
478
|
+
#
|
479
|
+
rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
|
480
|
+
return rssurl if rssurl
|
481
|
+
|
482
|
+
#
|
483
|
+
# Stage 5 - fall over to Feedbag
|
484
|
+
#
|
485
|
+
results = Feedbag.find(site_url)
|
486
|
+
# checked_results = []
|
487
|
+
# results.each do |result|
|
488
|
+
# struct = UrlCommon.check_for_404(result)
|
489
|
+
# checked_results << result if struct.status == 200
|
490
|
+
# end
|
491
|
+
|
492
|
+
#
|
493
|
+
# Stage 6 - cache failures to redis so don't look for them again
|
494
|
+
#
|
495
|
+
#$redis.
|
496
|
+
|
497
|
+
return UrlCommon.select_best_rssurl_from_rssurls(results)
|
498
|
+
end
|
499
|
+
|
269
500
|
end
|
data/url_common.gemspec
CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.bindir = "exe"
|
27
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
28
|
spec.require_paths = ["lib"]
|
29
|
+
|
30
|
+
spec.add_dependency 'fuzzyurl', '~> 0.9.0'
|
31
|
+
spec.add_dependency 'mechanize', '~> 2.6'
|
32
|
+
|
29
33
|
end
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
11
|
+
date: 2022-06-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fuzzyurl
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mechanize
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.6'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.6'
|
13
41
|
description: This is a class library for common url manipulation and crawling tasks. It
|
14
42
|
is based on a career focused on the practical side of working with the Internet
|
15
43
|
using Ruby.
|
@@ -24,6 +52,7 @@ files:
|
|
24
52
|
- ".travis.yml"
|
25
53
|
- CODE_OF_CONDUCT.md
|
26
54
|
- Gemfile
|
55
|
+
- Gemfile.lock
|
27
56
|
- LICENSE.txt
|
28
57
|
- README.md
|
29
58
|
- Rakefile
|