url_finder 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +5 -5
- data/exe/url_finder +6 -1
- data/lib/url_finder.rb +2 -2
- data/lib/url_finder/reader.rb +3 -2
- data/lib/url_finder/readers/html_reader.rb +4 -2
- data/lib/url_finder/readers/markdown_reader.rb +4 -2
- data/lib/url_finder/readers/sitemap_reader.rb +79 -0
- data/lib/url_finder/version.rb +1 -1
- data/url_finder.gemspec +2 -2
- metadata +6 -7
- data/.byebug_history +0 -3
- data/lib/url_finder/readers/csv_reader.rb +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cb4f68c79b917473804be0fdc5ff776f3f816571b9aafb7358fd746211af05c0
|
4
|
+
data.tar.gz: d17bbf2b35810cfa9d462a9a0a708df565d71868187b20b0735cd55b1d01112e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74b27c7b82404d569a56d141351c4712f4dc112033ce63707bbf3dbd35617526137a002e52a485d2f9673679b62b6fa20134c6c09bf7e504e2432c08a5e98d71
|
7
|
+
data.tar.gz: 42e499f66152503668664e9051c9b59543bc65dcd260fbfc78f23a762db0a9229803e3468e2839d7d197cb61fde2d171c4b7adcaae0e9eeb4d18b4de923bc144
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# UrlFinder
|
2
2
|
|
3
|
-
Find URLs in various file formats - supports markdown, HTML
|
3
|
+
Find URLs in various file formats - supports markdown, HTML and regular text.
|
4
4
|
|
5
5
|
- [Usage](#usage)
|
6
6
|
- [CLI](#cli)
|
@@ -39,7 +39,7 @@ UrlFinder.from_file('README', 'md').each do |url|
|
|
39
39
|
end
|
40
40
|
```
|
41
41
|
|
42
|
-
Supported formats are `markdown` (aliased as `md`), `html
|
42
|
+
Supported formats are `markdown` (aliased as `md`), `html` and `string`.
|
43
43
|
|
44
44
|
Find URLs in string
|
45
45
|
```ruby
|
@@ -54,15 +54,15 @@ end
|
|
54
54
|
```
|
55
55
|
Usage: url_finder --help
|
56
56
|
--file=/path/to/file Input file
|
57
|
-
--format=file_format Input format (html, markdown,
|
57
|
+
--format=file_format Input format (html, markdown, sitemap.xml, string)
|
58
58
|
-h, --help How to use
|
59
59
|
```
|
60
60
|
|
61
61
|
## Wish list
|
62
62
|
|
63
|
-
- Better CSV support
|
64
|
-
+ Current support is extremely crude and makes a lot of assumptions about the file..
|
65
63
|
- RDoc support
|
64
|
+
- Sitemap.xml support
|
65
|
+
- and..?
|
66
66
|
|
67
67
|
## Development
|
68
68
|
|
data/exe/url_finder
CHANGED
@@ -16,10 +16,15 @@ OptionParser.new do |parser|
|
|
16
16
|
options[:file] = string
|
17
17
|
end
|
18
18
|
|
19
|
-
parser.on('--format=file_format', String, 'Input format (html, markdown,
|
19
|
+
parser.on('--format=file_format', String, 'Input format (html, markdown, sitemap.xml, string)') do |string|
|
20
20
|
options[:format] = string
|
21
21
|
end
|
22
22
|
|
23
|
+
parser.on('-v', '--version', 'Print version') do
|
24
|
+
puts "UrlFinder version #{UrlFinder::VERSION}"
|
25
|
+
exit
|
26
|
+
end
|
27
|
+
|
23
28
|
parser.on('-h', '--help', 'How to use') do
|
24
29
|
puts parser
|
25
30
|
exit
|
data/lib/url_finder.rb
CHANGED
@@ -9,7 +9,7 @@ module UrlFinder
|
|
9
9
|
# @param [String] path to file
|
10
10
|
# @param [String] file_format
|
11
11
|
# of file; if nil, the file format will be inferred from the
|
12
|
-
# file extension (markdown, html,
|
12
|
+
# file extension (markdown, html, string)
|
13
13
|
def self.from_file(path, file_format = nil)
|
14
14
|
file_format ||= path.split('.').last
|
15
15
|
|
@@ -18,7 +18,7 @@ module UrlFinder
|
|
18
18
|
|
19
19
|
# Find URLs in string
|
20
20
|
# @param [String] content string
|
21
|
-
# @param [String] file_format of string (markdown, html,
|
21
|
+
# @param [String] file_format of string (markdown, html, string)
|
22
22
|
def self.from(content, file_format)
|
23
23
|
Reader.new(content, file_format).urls
|
24
24
|
end
|
data/lib/url_finder/reader.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'url_finder/readers/csv_reader'
|
4
3
|
require 'url_finder/readers/html_reader'
|
5
4
|
require 'url_finder/readers/markdown_reader'
|
5
|
+
require 'url_finder/readers/sitemap_reader'
|
6
6
|
require 'url_finder/readers/string_reader'
|
7
7
|
|
8
8
|
module UrlFinder
|
@@ -12,7 +12,8 @@ module UrlFinder
|
|
12
12
|
'markdown' => MarkdownReader,
|
13
13
|
'md' => MarkdownReader,
|
14
14
|
'html' => HTMLReader,
|
15
|
-
'
|
15
|
+
'sitemap' => SitemapReader,
|
16
|
+
'sitemap.xml' => SitemapReader,
|
16
17
|
'string' => StringReader,
|
17
18
|
'txt' => StringReader,
|
18
19
|
}.freeze
|
data/lib/url_finder/readers/html_reader.rb
CHANGED
@@ -9,8 +9,10 @@ module UrlFinder
|
|
9
9
|
# Returns the found URLs
|
10
10
|
# @return [Array<String>] the found URLs
|
11
11
|
def urls
|
12
|
-
|
13
|
-
|
12
|
+
@urls ||= begin
|
13
|
+
document = Nokogiri::HTML(content)
|
14
|
+
document.css('a').map { |e| e['href'] }.compact
|
15
|
+
end
|
14
16
|
end
|
15
17
|
end
|
16
18
|
end
|
data/lib/url_finder/readers/markdown_reader.rb
CHANGED
@@ -10,8 +10,10 @@ module UrlFinder
|
|
10
10
|
# Returns the found URLs
|
11
11
|
# @return [Array<String>] the found URLs
|
12
12
|
def urls
|
13
|
-
|
14
|
-
|
13
|
+
@urls ||= begin
|
14
|
+
html = Kramdown::Document.new(content).to_html
|
15
|
+
HTMLReader.new(html).urls
|
16
|
+
end
|
15
17
|
end
|
16
18
|
end
|
17
19
|
end
|
data/lib/url_finder/readers/sitemap_reader.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
module UrlFinder
|
4
|
+
# Parse Sitemaps, https://www.sitemaps.org
|
5
|
+
class SitemapReader < BaseReader
|
6
|
+
# Return all URLs defined in Sitemap.
|
7
|
+
# @return [Array<String>] of URLs defined in Sitemap.
|
8
|
+
# @example Get URLs defined in Sitemap
|
9
|
+
# sitemap = SitemapReader.new(xml)
|
10
|
+
# sitemap.urls
|
11
|
+
def urls
|
12
|
+
@urls ||= extract_urls('url')
|
13
|
+
end
|
14
|
+
|
15
|
+
# The XML document
|
16
|
+
# @return [REXML::Document] the XML document
|
17
|
+
def document
|
18
|
+
@document ||= begin
|
19
|
+
REXML::Document.new(content)
|
20
|
+
rescue REXML::ParseException => _e
|
21
|
+
REXML::Document.new('')
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Return all sitemap URLs defined in Sitemap.
|
26
|
+
# @return [Array<String>] of Sitemap URLs defined in Sitemap.
|
27
|
+
# @example Get Sitemap URLs defined in Sitemap
|
28
|
+
# sitemap = SitemapReader.new(xml)
|
29
|
+
# sitemap.sitemaps
|
30
|
+
def sitemaps
|
31
|
+
@sitemaps ||= extract_urls('sitemap')
|
32
|
+
end
|
33
|
+
|
34
|
+
# Check if sitemap is a plain file
|
35
|
+
# @return [Boolean] whether document is plain
|
36
|
+
def plain_document?
|
37
|
+
document.elements.empty?
|
38
|
+
end
|
39
|
+
|
40
|
+
# Return the name of the document (if there is one)
|
41
|
+
# @return [String] the document root name
|
42
|
+
def root_name
|
43
|
+
return unless document.root
|
44
|
+
|
45
|
+
document.root.name
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns true if Sitemap is a Sitemap index
|
49
|
+
# @return [Boolean] whether the Sitemap is a Sitemap index or not
|
50
|
+
# @example Check if Sitemap is a sitemap index
|
51
|
+
# sitemap = SitemapReader.new(xml)
|
52
|
+
# sitemap.sitemap_index?
|
53
|
+
def sitemap_index?
|
54
|
+
root_name == 'sitemapindex'
|
55
|
+
end
|
56
|
+
|
57
|
+
# Returns true if Sitemap lists regular URLs
|
58
|
+
# @return [Boolean] whether the Sitemap is a regular URL list
|
59
|
+
# @example Check if Sitemap is a regular URL list
|
60
|
+
# sitemap = SitemapReader.new(xml)
|
61
|
+
# sitemap.urlset?
|
62
|
+
def urlset?
|
63
|
+
root_name == 'urlset'
|
64
|
+
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
# Extract URLs from Sitemap
|
69
|
+
def extract_urls(node_name)
|
70
|
+
return document.to_s.each_line.map(&:strip) if plain_document?
|
71
|
+
|
72
|
+
urls = []
|
73
|
+
document.root.elements.each("#{node_name}/loc") do |element|
|
74
|
+
urls << element.text
|
75
|
+
end
|
76
|
+
urls
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/url_finder/version.rb
CHANGED
data/url_finder.gemspec
CHANGED
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.authors = ['Jacob Burenstam']
|
11
11
|
spec.email = ['burenstam@gmail.com']
|
12
12
|
|
13
|
-
spec.summary = 'Find URLs in common file formats (Markdown, HTML,
|
14
|
-
spec.description = 'Find URLs in common file formats (Markdown, HTML,
|
13
|
+
spec.summary = 'Find URLs in common file formats (Markdown, HTML, string).'
|
14
|
+
spec.description = 'Find URLs in common file formats (Markdown, HTML, string) with ease - Ruby and CLI.'
|
15
15
|
spec.homepage = 'https://github.com/buren/url_finder'
|
16
16
|
spec.license = 'MIT'
|
17
17
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: kramdown
|
@@ -108,8 +108,8 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '3.0'
|
111
|
-
description: Find URLs in common file formats (Markdown, HTML,
|
112
|
-
|
111
|
+
description: Find URLs in common file formats (Markdown, HTML, string) with ease -
|
112
|
+
Ruby and CLI.
|
113
113
|
email:
|
114
114
|
- burenstam@gmail.com
|
115
115
|
executables:
|
@@ -117,7 +117,6 @@ executables:
|
|
117
117
|
extensions: []
|
118
118
|
extra_rdoc_files: []
|
119
119
|
files:
|
120
|
-
- ".byebug_history"
|
121
120
|
- ".gitignore"
|
122
121
|
- ".rspec"
|
123
122
|
- ".rubocop.yml"
|
@@ -133,9 +132,9 @@ files:
|
|
133
132
|
- lib/url_finder.rb
|
134
133
|
- lib/url_finder/reader.rb
|
135
134
|
- lib/url_finder/readers/base_reader.rb
|
136
|
-
- lib/url_finder/readers/csv_reader.rb
|
137
135
|
- lib/url_finder/readers/html_reader.rb
|
138
136
|
- lib/url_finder/readers/markdown_reader.rb
|
137
|
+
- lib/url_finder/readers/sitemap_reader.rb
|
139
138
|
- lib/url_finder/readers/string_reader.rb
|
140
139
|
- lib/url_finder/version.rb
|
141
140
|
- url_finder.gemspec
|
@@ -162,5 +161,5 @@ rubyforge_project:
|
|
162
161
|
rubygems_version: 2.7.6
|
163
162
|
signing_key:
|
164
163
|
specification_version: 4
|
165
|
-
summary: Find URLs in common file formats (Markdown, HTML,
|
164
|
+
summary: Find URLs in common file formats (Markdown, HTML, string).
|
166
165
|
test_files: []
|
data/.byebug_history
DELETED
data/lib/url_finder/readers/csv_reader.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'csv'
|
4
|
-
require 'url_finder/readers/base_reader'
|
5
|
-
|
6
|
-
module UrlFinder
|
7
|
-
# Find URLs in CSV string
|
8
|
-
class CSVReader < BaseReader
|
9
|
-
# Returns the found URLs
|
10
|
-
# @return [Array<String>] the found URLs
|
11
|
-
def urls
|
12
|
-
@urls ||= CSV.parse(content).map(&:first).compact
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|