url_finder 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7514dce4f92bf06c8d4de7ea64778e43f5edde852e529365c7e55b61353df6cc
4
- data.tar.gz: 563a39ca2f31130388a9ed2bedb56417f49196c0e11fdb31f1d83aaf99df6485
3
+ metadata.gz: cb4f68c79b917473804be0fdc5ff776f3f816571b9aafb7358fd746211af05c0
4
+ data.tar.gz: d17bbf2b35810cfa9d462a9a0a708df565d71868187b20b0735cd55b1d01112e
5
5
  SHA512:
6
- metadata.gz: d78fb982fa14ae0006752baa5d82441b09b3a676305405c42cc662794841c39fbb9aa679cb01f42e111d6e467cedb91b5233d568bd360bf005bd77c6580b3cc9
7
- data.tar.gz: 805618369adfbb8bf1ab2ae067cb1f19392ee6b30cc58b36d21318924e3d6a8023b0df8f0302b291140c3e6b2bb9fa7f8d38e9f84fc2e89e5f30328202f573ae
6
+ metadata.gz: 74b27c7b82404d569a56d141351c4712f4dc112033ce63707bbf3dbd35617526137a002e52a485d2f9673679b62b6fa20134c6c09bf7e504e2432c08a5e98d71
7
+ data.tar.gz: 42e499f66152503668664e9051c9b59543bc65dcd260fbfc78f23a762db0a9229803e3468e2839d7d197cb61fde2d171c4b7adcaae0e9eeb4d18b4de923bc144
data/.gitignore CHANGED
@@ -12,3 +12,5 @@
12
12
 
13
13
  # This is a library so don't include the lock file
14
14
  Gemfile.lock
15
+
16
+ .byebug_history
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # UrlFinder
2
2
 
3
- Find URLs in various file formats - supports markdown, HTML, CSV and regular text.
3
+ Find URLs in various file formats - supports markdown, HTML and regular text.
4
4
 
5
5
  - [Usage](#usage)
6
6
  - [CLI](#cli)
@@ -39,7 +39,7 @@ UrlFinder.from_file('README', 'md').each do |url|
39
39
  end
40
40
  ```
41
41
 
42
- Supported formats are `markdown` (aliased as `md`), `html`, `csv` and `string`.
42
+ Supported formats are `markdown` (aliased as `md`), `html` and `string`.
43
43
 
44
44
  Find URLs in string
45
45
  ```ruby
@@ -54,15 +54,15 @@ end
54
54
  ```
55
55
  Usage: url_finder --help
56
56
  --file=/path/to/file Input file
57
- --format=file_format Input format (html, markdown, csv, string)
57
+ --format=file_format Input format (html, markdown, sitemap.xml, string)
58
58
  -h, --help How to use
59
59
  ```
60
60
 
61
61
  ## Wish list
62
62
 
63
- - Better CSV support
64
- + Current support is extremely crude and makes a lot of assumptions about the file..
65
63
  - RDoc support
64
+ - Sitemap.xml support
65
+ - and..?
66
66
 
67
67
  ## Development
68
68
 
@@ -16,10 +16,15 @@ OptionParser.new do |parser|
16
16
  options[:file] = string
17
17
  end
18
18
 
19
- parser.on('--format=file_format', String, 'Input format (html, markdown, csv, string)') do |string|
19
+ parser.on('--format=file_format', String, 'Input format (html, markdown, sitemap.xml, string)') do |string|
20
20
  options[:format] = string
21
21
  end
22
22
 
23
+ parser.on('-v', '--version', 'Print version') do
24
+ puts "UrlFinder version #{UrlFinder::VERSION}"
25
+ exit
26
+ end
27
+
23
28
  parser.on('-h', '--help', 'How to use') do
24
29
  puts parser
25
30
  exit
@@ -9,7 +9,7 @@ module UrlFinder
9
9
  # @param [String] path to file
10
10
  # @param [String] file_format
11
11
  # of file; if nil, the file format is inferred from the
12
- # file extension (markdown, html, csv, string)
12
+ # file extension (markdown, html, string)
13
13
  def self.from_file(path, file_format = nil)
14
14
  file_format ||= path.split('.').last
15
15
 
@@ -18,7 +18,7 @@ module UrlFinder
18
18
 
19
19
  # Find URLs in string
20
20
  # @param [String] content string
21
- # @param [String] file_format of string (markdown, html, csv, string)
21
+ # @param [String] file_format of string (markdown, html, string)
22
22
  def self.from(content, file_format)
23
23
  Reader.new(content, file_format).urls
24
24
  end
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'url_finder/readers/csv_reader'
4
3
  require 'url_finder/readers/html_reader'
5
4
  require 'url_finder/readers/markdown_reader'
5
+ require 'url_finder/readers/sitemap_reader'
6
6
  require 'url_finder/readers/string_reader'
7
7
 
8
8
  module UrlFinder
@@ -12,7 +12,8 @@ module UrlFinder
12
12
  'markdown' => MarkdownReader,
13
13
  'md' => MarkdownReader,
14
14
  'html' => HTMLReader,
15
- 'csv' => CSVReader,
15
+ 'sitemap' => SitemapReader,
16
+ 'sitemap.xml' => SitemapReader,
16
17
  'string' => StringReader,
17
18
  'txt' => StringReader,
18
19
  }.freeze
@@ -9,8 +9,10 @@ module UrlFinder
9
9
  # Returns the found URLs
10
10
  # @return [Array<String>] the found URLs
11
11
  def urls
12
- document = Nokogiri::HTML(content)
13
- @urls ||= document.css('a').map { |e| e['href'] }.compact
12
+ @urls ||= begin
13
+ document = Nokogiri::HTML(content)
14
+ document.css('a').map { |e| e['href'] }.compact
15
+ end
14
16
  end
15
17
  end
16
18
  end
@@ -10,8 +10,10 @@ module UrlFinder
10
10
  # Returns the found URLs
11
11
  # @return [Array<String>] the found URLs
12
12
  def urls
13
- html = Kramdown::Document.new(content).to_html
14
- @urls ||= HTMLReader.new(html).urls
13
+ @urls ||= begin
14
+ html = Kramdown::Document.new(content).to_html
15
+ HTMLReader.new(html).urls
16
+ end
15
17
  end
16
18
  end
17
19
  end
@@ -0,0 +1,79 @@
1
+ require 'rexml/document'
2
+
3
+ module UrlFinder
4
+ # Parse Sitemaps, https://www.sitemaps.org
5
+ class SitemapReader < BaseReader
6
+ # Return all URLs defined in Sitemap.
7
+ # @return [Array<String>] of URLs defined in Sitemap.
8
+ # @example Get URLs defined in Sitemap
9
+ # sitemap = SitemapReader.new(xml)
10
+ # sitemap.urls
11
+ def urls
12
+ @urls ||= extract_urls('url')
13
+ end
14
+
15
+ # The XML document
16
+ # @return [REXML::Document] the XML document
17
+ def document
18
+ @document ||= begin
19
+ REXML::Document.new(content)
20
+ rescue REXML::ParseException => _e
21
+ REXML::Document.new('')
22
+ end
23
+ end
24
+
25
+ # Return all sitemap URLs defined in Sitemap.
26
+ # @return [Array<String>] of Sitemap URLs defined in Sitemap.
27
+ # @example Get Sitemap URLs defined in Sitemap
28
+ # sitemap = SitemapReader.new(xml)
29
+ # sitemap.sitemaps
30
+ def sitemaps
31
+ @sitemaps ||= extract_urls('sitemap')
32
+ end
33
+
34
+ # Check if sitemap is a plain file
35
+ # @return [Boolean] whether document is plain
36
+ def plain_document?
37
+ document.elements.empty?
38
+ end
39
+
40
+ # Return the name of the document (if there is one)
41
+ # @return [String, nil] the document root name, or nil when the document has no root
42
+ def root_name
43
+ return unless document.root
44
+
45
+ document.root.name
46
+ end
47
+
48
+ # Returns true if Sitemap is a Sitemap index
49
+ # @return [Boolean] whether the Sitemap is a Sitemap index or not
50
+ # @example Check if Sitemap is a sitemap index
51
+ # sitemap = SitemapReader.new(xml)
52
+ # sitemap.sitemap_index?
53
+ def sitemap_index?
54
+ root_name == 'sitemapindex'
55
+ end
56
+
57
+ # Returns true if Sitemap lists regular URLs
58
+ # @return [Boolean] whether the Sitemap is a regular URL list
59
+ # @example Check if Sitemap is a regular URL list
60
+ # sitemap = SitemapReader.new(xml)
61
+ # sitemap.urlset?
62
+ def urlset?
63
+ root_name == 'urlset'
64
+ end
65
+
66
+ private
67
+
68
+ # Extract URLs from Sitemap
69
+ def extract_urls(node_name)
70
+ return document.to_s.each_line.map(&:strip) if plain_document?
71
+
72
+ urls = []
73
+ document.root.elements.each("#{node_name}/loc") do |element|
74
+ urls << element.text
75
+ end
76
+ urls
77
+ end
78
+ end
79
+ end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module UrlFinder
4
4
  # Gem version
5
- VERSION = '0.1.0'.freeze
5
+ VERSION = '0.2.0'.freeze
6
6
  end
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ['Jacob Burenstam']
11
11
  spec.email = ['burenstam@gmail.com']
12
12
 
13
- spec.summary = 'Find URLs in common file formats (Markdown, HTML, CSV, string).'
14
- spec.description = 'Find URLs in common file formats (Markdown, HTML, CSV, string) with ease - Ruby and CLI.'
13
+ spec.summary = 'Find URLs in common file formats (Markdown, HTML, string).'
14
+ spec.description = 'Find URLs in common file formats (Markdown, HTML, string) with ease - Ruby and CLI.'
15
15
  spec.homepage = 'https://github.com/buren/url_finder'
16
16
  spec.license = 'MIT'
17
17
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-08-19 00:00:00.000000000 Z
11
+ date: 2018-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: kramdown
@@ -108,8 +108,8 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '3.0'
111
- description: Find URLs in common file formats (Markdown, HTML, CSV, string) with ease
112
- - Ruby and CLI.
111
+ description: Find URLs in common file formats (Markdown, HTML, string) with ease -
112
+ Ruby and CLI.
113
113
  email:
114
114
  - burenstam@gmail.com
115
115
  executables:
@@ -117,7 +117,6 @@ executables:
117
117
  extensions: []
118
118
  extra_rdoc_files: []
119
119
  files:
120
- - ".byebug_history"
121
120
  - ".gitignore"
122
121
  - ".rspec"
123
122
  - ".rubocop.yml"
@@ -133,9 +132,9 @@ files:
133
132
  - lib/url_finder.rb
134
133
  - lib/url_finder/reader.rb
135
134
  - lib/url_finder/readers/base_reader.rb
136
- - lib/url_finder/readers/csv_reader.rb
137
135
  - lib/url_finder/readers/html_reader.rb
138
136
  - lib/url_finder/readers/markdown_reader.rb
137
+ - lib/url_finder/readers/sitemap_reader.rb
139
138
  - lib/url_finder/readers/string_reader.rb
140
139
  - lib/url_finder/version.rb
141
140
  - url_finder.gemspec
@@ -162,5 +161,5 @@ rubyforge_project:
162
161
  rubygems_version: 2.7.6
163
162
  signing_key:
164
163
  specification_version: 4
165
- summary: Find URLs in common file formats (Markdown, HTML, CSV, string).
164
+ summary: Find URLs in common file formats (Markdown, HTML, string).
166
165
  test_files: []
@@ -1,3 +0,0 @@
1
- exit
2
- finder.urls.class
3
- finder.urls
@@ -1,15 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'csv'
4
- require 'url_finder/readers/base_reader'
5
-
6
- module UrlFinder
7
- # Find URLs in CSV string
8
- class CSVReader < BaseReader
9
- # Returns the found URLs
10
- # @return [Array<String>] the found URLs
11
- def urls
12
- @urls ||= CSV.parse(content).map(&:first).compact
13
- end
14
- end
15
- end