url_finder 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7514dce4f92bf06c8d4de7ea64778e43f5edde852e529365c7e55b61353df6cc
4
- data.tar.gz: 563a39ca2f31130388a9ed2bedb56417f49196c0e11fdb31f1d83aaf99df6485
3
+ metadata.gz: cb4f68c79b917473804be0fdc5ff776f3f816571b9aafb7358fd746211af05c0
4
+ data.tar.gz: d17bbf2b35810cfa9d462a9a0a708df565d71868187b20b0735cd55b1d01112e
5
5
  SHA512:
6
- metadata.gz: d78fb982fa14ae0006752baa5d82441b09b3a676305405c42cc662794841c39fbb9aa679cb01f42e111d6e467cedb91b5233d568bd360bf005bd77c6580b3cc9
7
- data.tar.gz: 805618369adfbb8bf1ab2ae067cb1f19392ee6b30cc58b36d21318924e3d6a8023b0df8f0302b291140c3e6b2bb9fa7f8d38e9f84fc2e89e5f30328202f573ae
6
+ metadata.gz: 74b27c7b82404d569a56d141351c4712f4dc112033ce63707bbf3dbd35617526137a002e52a485d2f9673679b62b6fa20134c6c09bf7e504e2432c08a5e98d71
7
+ data.tar.gz: 42e499f66152503668664e9051c9b59543bc65dcd260fbfc78f23a762db0a9229803e3468e2839d7d197cb61fde2d171c4b7adcaae0e9eeb4d18b4de923bc144
data/.gitignore CHANGED
@@ -12,3 +12,5 @@
12
12
 
13
13
  # This is a library so don't include the lock file
14
14
  Gemfile.lock
15
+
16
+ .byebug_history
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # UrlFinder
2
2
 
3
- Find URLs in various file formats - supports markdown, HTML, CSV and regular text.
3
+ Find URLs in various file formats - supports markdown, HTML and regular text.
4
4
 
5
5
  - [Usage](#usage)
6
6
  - [CLI](#cli)
@@ -39,7 +39,7 @@ UrlFinder.from_file('README', 'md').each do |url|
39
39
  end
40
40
  ```
41
41
 
42
- Supported formats are `markdown` (aliased as `md`), `html`, `csv` and `string`.
42
+ Supported formats are `markdown` (aliased as `md`), `html`, `sitemap` (aliased as `sitemap.xml`) and `string` (aliased as `txt`).
43
43
 
44
44
  Find URLs in string
45
45
  ```ruby
@@ -54,15 +54,15 @@ end
54
54
  ```
55
55
  Usage: url_finder --help
56
56
  --file=/path/to/file Input file
57
- --format=file_format Input format (html, markdown, csv, string)
57
+ --format=file_format Input format (html, markdown, sitemap.xml, string)
58
58
  -h, --help How to use
59
59
  ```
60
60
 
61
61
  ## Wish list
62
62
 
63
- - Better CSV support
64
- + Current support is extremely crude and makes a lot of assumptions about the file..
65
63
  - RDoc support
64
+ - Sitemap.xml support
65
+ - and..?
66
66
 
67
67
  ## Development
68
68
 
@@ -16,10 +16,15 @@ OptionParser.new do |parser|
16
16
  options[:file] = string
17
17
  end
18
18
 
19
- parser.on('--format=file_format', String, 'Input format (html, markdown, csv, string)') do |string|
19
+ parser.on('--format=file_format', String, 'Input format (html, markdown, sitemap.xml, string)') do |string|
20
20
  options[:format] = string
21
21
  end
22
22
 
23
+ parser.on('-v', '--version', 'Print version') do
24
+ puts "UrlFinder version #{UrlFinder::VERSION}"
25
+ exit
26
+ end
27
+
23
28
  parser.on('-h', '--help', 'How to use') do
24
29
  puts parser
25
30
  exit
@@ -9,7 +9,7 @@ module UrlFinder
9
9
  # @param [String] path to file
10
10
  # @param [String] file_format
11
11
  # of file; if nil, the file format will be inferred from the
12
- # file extension (markdown, html, csv, string)
12
+ # file extension (markdown, html, string)
13
13
  def self.from_file(path, file_format = nil)
14
14
  file_format ||= path.split('.').last
15
15
 
@@ -18,7 +18,7 @@ module UrlFinder
18
18
 
19
19
  # Find URLs in string
20
20
  # @param [String] content string
21
- # @param [String] file_format of string (markdown, html, csv, string)
21
+ # @param [String] file_format of string (markdown, html, string)
22
22
  def self.from(content, file_format)
23
23
  Reader.new(content, file_format).urls
24
24
  end
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'url_finder/readers/csv_reader'
4
3
  require 'url_finder/readers/html_reader'
5
4
  require 'url_finder/readers/markdown_reader'
5
+ require 'url_finder/readers/sitemap_reader'
6
6
  require 'url_finder/readers/string_reader'
7
7
 
8
8
  module UrlFinder
@@ -12,7 +12,8 @@ module UrlFinder
12
12
  'markdown' => MarkdownReader,
13
13
  'md' => MarkdownReader,
14
14
  'html' => HTMLReader,
15
- 'csv' => CSVReader,
15
+ 'sitemap' => SitemapReader,
16
+ 'sitemap.xml' => SitemapReader,
16
17
  'string' => StringReader,
17
18
  'txt' => StringReader,
18
19
  }.freeze
@@ -9,8 +9,10 @@ module UrlFinder
9
9
  # Returns the found URLs
10
10
  # @return [Array<String>] the found URLs
11
11
  def urls
12
- document = Nokogiri::HTML(content)
13
- @urls ||= document.css('a').map { |e| e['href'] }.compact
12
+ @urls ||= begin
13
+ document = Nokogiri::HTML(content)
14
+ document.css('a').map { |e| e['href'] }.compact
15
+ end
14
16
  end
15
17
  end
16
18
  end
@@ -10,8 +10,10 @@ module UrlFinder
10
10
  # Returns the found URLs
11
11
  # @return [Array<String>] the found URLs
12
12
  def urls
13
- html = Kramdown::Document.new(content).to_html
14
- @urls ||= HTMLReader.new(html).urls
13
+ @urls ||= begin
14
+ html = Kramdown::Document.new(content).to_html
15
+ HTMLReader.new(html).urls
16
+ end
15
17
  end
16
18
  end
17
19
  end
@@ -0,0 +1,79 @@
1
+ require 'rexml/document'
2
+
3
+ module UrlFinder
4
+ # Parse Sitemaps, https://www.sitemaps.org
5
+ class SitemapReader < BaseReader
6
+ # Return all URLs defined in Sitemap.
7
+ # @return [Array<String>] of URLs defined in Sitemap.
8
+ # @example Get URLs defined in Sitemap
9
+ # sitemap = SitemapReader.new(xml)
10
+ # sitemap.urls
11
+ def urls
12
+ @urls ||= extract_urls('url')
13
+ end
14
+
15
+ # The XML document
16
+ # @return [REXML::Document] the XML document
17
+ def document
18
+ @document ||= begin
19
+ REXML::Document.new(content)
20
+ rescue REXML::ParseException => _e
21
+ REXML::Document.new('')
22
+ end
23
+ end
24
+
25
+ # Return all sitemap URLs defined in Sitemap.
26
+ # @return [Array<String>] of Sitemap URLs defined in Sitemap.
27
+ # @example Get Sitemap URLs defined in Sitemap
28
+ # sitemap = SitemapReader.new(xml)
29
+ # sitemap.sitemaps
30
+ def sitemaps
31
+ @sitemaps ||= extract_urls('sitemap')
32
+ end
33
+
34
+ # Check if sitemap is a plain file
35
+ # @return [Boolean] whether document is plain
36
+ def plain_document?
37
+ document.elements.empty?
38
+ end
39
+
40
+ # Return the name of the document (if there is one)
41
+ # @return [String, nil] the document root name, or nil when the document has no root
42
+ def root_name
43
+ return unless document.root
44
+
45
+ document.root.name
46
+ end
47
+
48
+ # Returns true if Sitemap is a Sitemap index
49
+ # @return [Boolean] whether the Sitemap is a Sitemap index or not
50
+ # @example Check if Sitemap is a sitemap index
51
+ # sitemap = SitemapReader.new(xml)
52
+ # sitemap.sitemap_index?
53
+ def sitemap_index?
54
+ root_name == 'sitemapindex'
55
+ end
56
+
57
+ # Returns true if Sitemap lists regular URLs
58
+ # @return [Boolean] whether the Sitemap is a regular URL list
59
+ # @example Check if Sitemap is a regular URL list
60
+ # sitemap = SitemapReader.new(xml)
61
+ # sitemap.urlset?
62
+ def urlset?
63
+ root_name == 'urlset'
64
+ end
65
+
66
+ private
67
+
68
+ # Extract URLs from Sitemap
69
+ def extract_urls(node_name)
70
+ return document.to_s.each_line.map(&:strip) if plain_document?
71
+
72
+ urls = []
73
+ document.root.elements.each("#{node_name}/loc") do |element|
74
+ urls << element.text
75
+ end
76
+ urls
77
+ end
78
+ end
79
+ end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module UrlFinder
4
4
  # Gem version
5
- VERSION = '0.1.0'.freeze
5
+ VERSION = '0.2.0'.freeze
6
6
  end
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ['Jacob Burenstam']
11
11
  spec.email = ['burenstam@gmail.com']
12
12
 
13
- spec.summary = 'Find URLs in common file formats (Markdown, HTML, CSV, string).'
14
- spec.description = 'Find URLs in common file formats (Markdown, HTML, CSV, string) with ease - Ruby and CLI.'
13
+ spec.summary = 'Find URLs in common file formats (Markdown, HTML, string).'
14
+ spec.description = 'Find URLs in common file formats (Markdown, HTML, string) with ease - Ruby and CLI.'
15
15
  spec.homepage = 'https://github.com/buren/url_finder'
16
16
  spec.license = 'MIT'
17
17
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-08-19 00:00:00.000000000 Z
11
+ date: 2018-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: kramdown
@@ -108,8 +108,8 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '3.0'
111
- description: Find URLs in common file formats (Markdown, HTML, CSV, string) with ease
112
- - Ruby and CLI.
111
+ description: Find URLs in common file formats (Markdown, HTML, string) with ease -
112
+ Ruby and CLI.
113
113
  email:
114
114
  - burenstam@gmail.com
115
115
  executables:
@@ -117,7 +117,6 @@ executables:
117
117
  extensions: []
118
118
  extra_rdoc_files: []
119
119
  files:
120
- - ".byebug_history"
121
120
  - ".gitignore"
122
121
  - ".rspec"
123
122
  - ".rubocop.yml"
@@ -133,9 +132,9 @@ files:
133
132
  - lib/url_finder.rb
134
133
  - lib/url_finder/reader.rb
135
134
  - lib/url_finder/readers/base_reader.rb
136
- - lib/url_finder/readers/csv_reader.rb
137
135
  - lib/url_finder/readers/html_reader.rb
138
136
  - lib/url_finder/readers/markdown_reader.rb
137
+ - lib/url_finder/readers/sitemap_reader.rb
139
138
  - lib/url_finder/readers/string_reader.rb
140
139
  - lib/url_finder/version.rb
141
140
  - url_finder.gemspec
@@ -162,5 +161,5 @@ rubyforge_project:
162
161
  rubygems_version: 2.7.6
163
162
  signing_key:
164
163
  specification_version: 4
165
- summary: Find URLs in common file formats (Markdown, HTML, CSV, string).
164
+ summary: Find URLs in common file formats (Markdown, HTML, string).
166
165
  test_files: []
@@ -1,3 +0,0 @@
1
- exit
2
- finder.urls.class
3
- finder.urls
@@ -1,15 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'csv'
4
- require 'url_finder/readers/base_reader'
5
-
6
- module UrlFinder
7
- # Find URLs in CSV string
8
- class CSVReader < BaseReader
9
- # Returns the found URLs
10
- # @return [Array<String>] the found URLs
11
- def urls
12
- @urls ||= CSV.parse(content).map(&:first).compact
13
- end
14
- end
15
- end