RubyGems - url_finder - Versions diffs - 0.1.0 → 0.2.0 - Mend

url_finder 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/.gitignore +2 -0
data/README.md +5 -5
data/exe/url_finder +6 -1
data/lib/url_finder.rb +2 -2
data/lib/url_finder/reader.rb +3 -2
data/lib/url_finder/readers/html_reader.rb +4 -2
data/lib/url_finder/readers/markdown_reader.rb +4 -2
data/lib/url_finder/readers/sitemap_reader.rb +79 -0
data/lib/url_finder/version.rb +1 -1
data/url_finder.gemspec +2 -2
metadata +6 -7
data/.byebug_history +0 -3
data/lib/url_finder/readers/csv_reader.rb +0 -15

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7514dce4f92bf06c8d4de7ea64778e43f5edde852e529365c7e55b61353df6cc
-  data.tar.gz: 563a39ca2f31130388a9ed2bedb56417f49196c0e11fdb31f1d83aaf99df6485
+  metadata.gz: cb4f68c79b917473804be0fdc5ff776f3f816571b9aafb7358fd746211af05c0
+  data.tar.gz: d17bbf2b35810cfa9d462a9a0a708df565d71868187b20b0735cd55b1d01112e
 SHA512:
-  metadata.gz: d78fb982fa14ae0006752baa5d82441b09b3a676305405c42cc662794841c39fbb9aa679cb01f42e111d6e467cedb91b5233d568bd360bf005bd77c6580b3cc9
-  data.tar.gz: 805618369adfbb8bf1ab2ae067cb1f19392ee6b30cc58b36d21318924e3d6a8023b0df8f0302b291140c3e6b2bb9fa7f8d38e9f84fc2e89e5f30328202f573ae
+  metadata.gz: 74b27c7b82404d569a56d141351c4712f4dc112033ce63707bbf3dbd35617526137a002e52a485d2f9673679b62b6fa20134c6c09bf7e504e2432c08a5e98d71
+  data.tar.gz: 42e499f66152503668664e9051c9b59543bc65dcd260fbfc78f23a762db0a9229803e3468e2839d7d197cb61fde2d171c4b7adcaae0e9eeb4d18b4de923bc144

data/.gitignore CHANGED

@@ -12,3 +12,5 @@
 # This is a library so don't include the lock file
 Gemfile.lock
+.byebug_history

data/README.md CHANGED

@@ -1,6 +1,6 @@
 # UrlFinder
-Find URLs in various file formats - supports markdown, HTML, CSV and regular text.
+Find URLs in various file formats - supports markdown, HTML and regular text.
 - [Usage](#usage)
 - [CLI](#cli)
@@ -39,7 +39,7 @@ UrlFinder.from_file('README', 'md').each do |url|
 end
 ```
-Supported formats are `markdown` (aliased as `md`), `html`, `csv` and `string`.
+Supported formats are `markdown` (aliased as `md`), `html` and `string`.
 Find URLs in string
 ```ruby
@@ -54,15 +54,15 @@ end
 ```
 Usage: url_finder --help
         --file=/path/to/file         Input file
-        --format=file_format         Input format (html, markdown, csv, string)
+        --format=file_format         Input format (html, markdown, sitemap.xml, string)
     -h, --help                       How to use
 ```
 ## Wish list
-- Better CSV support
-  + Current support is extremely crude and makes a lot of assumptions about the file..
 - RDoc support
+- Sitemap.xml support
+- and..?
 ## Development

data/exe/url_finder CHANGED

@@ -16,10 +16,15 @@ OptionParser.new do |parser|
     options[:file] = string
   end
-  parser.on('--format=file_format', String, 'Input format (html, markdown, csv, string)') do |string|
+  parser.on('--format=file_format', String, 'Input format (html, markdown, sitemap.xml, string)') do |string|
     options[:format] = string
   end
+  parser.on('-v', '--version', 'Print version') do
+    puts "UrlFinder version #{UrlFinder::VERSION}"
+    exit
+  end
   parser.on('-h', '--help', 'How to use') do
     puts parser
     exit

data/lib/url_finder.rb CHANGED

@@ -9,7 +9,7 @@ module UrlFinder
   # @param [String] path to file
   # @param [String] file_format
   #   of file if nil file format will tried to be infered from
-  #   file extension (markdown, html, csv, string)
+  #   file extension (markdown, html, string)
   def self.from_file(path, file_format = nil)
     file_format ||= path.split('.').last
@@ -18,7 +18,7 @@ module UrlFinder
   # Find URLs in string
   # @param [String] content string
-  # @param [String] file_format of string (markdown, html, csv, string)
+  # @param [String] file_format of string (markdown, html, string)
   def self.from(content, file_format)
     Reader.new(content, file_format).urls
   end

data/lib/url_finder/reader.rb CHANGED

@@ -1,8 +1,8 @@
 # frozen_string_literal: true
-require 'url_finder/readers/csv_reader'
 require 'url_finder/readers/html_reader'
 require 'url_finder/readers/markdown_reader'
+require 'url_finder/readers/sitemap_reader'
 require 'url_finder/readers/string_reader'
 module UrlFinder
@@ -12,7 +12,8 @@ module UrlFinder
       'markdown' => MarkdownReader,
       'md' => MarkdownReader,
       'html' => HTMLReader,
-      'csv' => CSVReader,
+      'sitemap' => SitemapReader,
+      'sitemap.xml' => SitemapReader,
       'string' => StringReader,
       'txt' => StringReader,
     }.freeze

data/lib/url_finder/readers/html_reader.rb CHANGED

@@ -9,8 +9,10 @@ module UrlFinder
     # Returns the found URLs
     # @return [Array<String>] the found URLs
     def urls
-      document = Nokogiri::HTML(content)
-      @urls ||= document.css('a').map { |e| e['href'] }.compact
+      @urls ||= begin
+        document = Nokogiri::HTML(content)
+        document.css('a').map { |e| e['href'] }.compact
+      end
     end
   end
 end

data/lib/url_finder/readers/markdown_reader.rb CHANGED

@@ -10,8 +10,10 @@ module UrlFinder
     # Returns the found URLs
     # @return [Array<String>] the found URLs
     def urls
-      html = Kramdown::Document.new(content).to_html
-      @urls ||= HTMLReader.new(html).urls
+      @urls ||= begin
+        html = Kramdown::Document.new(content).to_html
+        HTMLReader.new(html).urls
+      end
     end
   end
 end

data/lib/url_finder/readers/sitemap_reader.rb ADDED

@@ -0,0 +1,79 @@
+require 'rexml/document'
+module UrlFinder
+  # Parse Sitemaps, https://www.sitemaps.org
+  class SitemapReader < BaseReader
+    # Return all URLs defined in Sitemap.
+    # @return [Array<String>] of URLs defined in Sitemap.
+    # @example Get URLs defined in Sitemap
+    #    sitemap = Sitemap.new(xml)
+    #    sitemap.urls
+    def urls
+      @urls ||= extract_urls('url')
+    end
+    # The XML document
+    # @return [REXML::Document] the XML document
+    def document
+      @document ||= begin
+        REXML::Document.new(content)
+      rescue REXML::ParseException => _e
+        REXML::Document.new('')
+      end
+    end
+    # Return all sitemap URLs defined in Sitemap.
+    # @return [Array<String>] of Sitemap URLs defined in Sitemap.
+    # @example Get Sitemap URLs defined in Sitemap
+    #    sitemap = Sitemap.new(xml)
+    #    sitemap.sitemaps
+    def sitemaps
+      @sitemaps ||= extract_urls('sitemap')
+    end
+    # Check if sitemap is a plain file
+    # @return [Boolean] whether document is plain
+    def plain_document?
+      document.elements.empty?
+    end
+    # Return the name of the document (if there is one)
+    # @return [String] the document root name
+    def root_name
+      return unless document.root
+      document.root.name
+    end
+    # Returns true of Sitemap is a Sitemap index
+    # @return [Boolean] of whether the Sitemap is an Sitemap index or not
+    # @example Check if Sitemap is a sitemap index
+    #    sitemap = Sitemap.new(xml)
+    #    sitemap.sitemap_index?
+    def sitemap_index?
+      root_name == 'sitemapindex'
+    end
+    # Returns true of Sitemap lists regular URLs
+    # @return [Boolean] of whether the Sitemap regular URL list
+    # @example Check if Sitemap is a regular URL list
+    #    sitemap = Sitemap.new(xml)
+    #    sitemap.urlset?
+    def urlset?
+      root_name == 'urlset'
+    end
+    private
+    # Extract URLs from Sitemap
+    def extract_urls(node_name)
+      return document.to_s.each_line.map(&:strip) if plain_document?
+      urls = []
+      document.root.elements.each("#{node_name}/loc") do |element|
+        urls << element.text
+      end
+      urls
+    end
+  end
+end

data/lib/url_finder/version.rb CHANGED

@@ -2,5 +2,5 @@
 module UrlFinder
   # Gem version
-  VERSION = '0.1.0'.freeze
+  VERSION = '0.2.0'.freeze
 end

data/url_finder.gemspec CHANGED

@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
   spec.authors       = ['Jacob Burenstam']
   spec.email         = ['burenstam@gmail.com']
-  spec.summary       = 'Find URLs in common file formats (Markdown, HTML, CSV, string).'
-  spec.description   = 'Find URLs in common file formats (Markdown, HTML, CSV, string) with ease - Ruby and CLI.'
+  spec.summary       = 'Find URLs in common file formats (Markdown, HTML, string).'
+  spec.description   = 'Find URLs in common file formats (Markdown, HTML, string) with ease - Ruby and CLI.'
   spec.homepage      = 'https://github.com/buren/url_finder'
   spec.license       = 'MIT'

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: url_finder
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-08-19 00:00:00.000000000 Z
+date: 2018-08-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: kramdown
@@ -108,8 +108,8 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
-description: Find URLs in common file formats (Markdown, HTML, CSV, string) with ease
-  - Ruby and CLI.
+description: Find URLs in common file formats (Markdown, HTML, string) with ease -
+  Ruby and CLI.
 email:
 - burenstam@gmail.com
 executables:
@@ -117,7 +117,6 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
-- ".byebug_history"
 - ".gitignore"
 - ".rspec"
 - ".rubocop.yml"
@@ -133,9 +132,9 @@ files:
 - lib/url_finder.rb
 - lib/url_finder/reader.rb
 - lib/url_finder/readers/base_reader.rb
-- lib/url_finder/readers/csv_reader.rb
 - lib/url_finder/readers/html_reader.rb
 - lib/url_finder/readers/markdown_reader.rb
+- lib/url_finder/readers/sitemap_reader.rb
 - lib/url_finder/readers/string_reader.rb
 - lib/url_finder/version.rb
 - url_finder.gemspec
@@ -162,5 +161,5 @@ rubyforge_project:
 rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
-summary: Find URLs in common file formats (Markdown, HTML, CSV, string).
+summary: Find URLs in common file formats (Markdown, HTML, string).
 test_files: []

data/.byebug_history DELETED

@@ -1,3 +0,0 @@
-exit
-finder.urls.class
-finder.urls

data/lib/url_finder/readers/csv_reader.rb DELETED

@@ -1,15 +0,0 @@
-# frozen_string_literal: true
-require 'csv'
-require 'url_finder/readers/base_reader'
-module UrlFinder
-  # Find URLs in CSV string
-  class CSVReader < BaseReader
-    # Returns the found URLs
-    # @return [Array<String>] the found URLs
-    def urls
-      @urls ||= CSV.parse(content).map(&:first).compact
-    end
-  end
-end