rubycrawl 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rubycrawl/helpers.rb +1 -1
- data/lib/rubycrawl/markdown_converter.rb +33 -6
- data/lib/rubycrawl/result.rb +10 -2
- data/lib/rubycrawl/site_crawler.rb +7 -2
- data/lib/rubycrawl/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 06ec3c9fa168060673012766dd0d217911aeee642c3ef545e31058927a202a72
|
|
4
|
+
data.tar.gz: 4ef9134d6398175ffc4e2cbb9e85a1b1a0d954d7db1f90d96388aa8319692cd7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0bbc9345c7cbb3e45b8af62baa473c0d01b07a71c24ec0f155b3532e54ffb93a98aed685794adba94b355cc6ccdfa9785d0c3e49dcc40213b386a060a4612b26
|
|
7
|
+
data.tar.gz: dec9d0d9333f44a642d296fe31d1585c18a289824c327bc59f5812b6726a7fd30d5ff4fffafdf8fdb74a37a69135725597fd292df5ed26088c3e8d3d8b792fc7
|
data/lib/rubycrawl/helpers.rb
CHANGED
|
@@ -11,7 +11,7 @@ class RubyCrawl
|
|
|
11
11
|
uri = URI.parse(url)
|
|
12
12
|
|
|
13
13
|
unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
|
14
|
-
raise ConfigurationError, "Only HTTP(S) URLs are supported, got: #{url}"
|
|
14
|
+
raise ConfigurationError, "Invalid URL: Only HTTP(S) URLs are supported, got: #{url}"
|
|
15
15
|
end
|
|
16
16
|
|
|
17
17
|
if uri.host&.match?(/^(localhost|127\.|192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[01]))/)
|
data/lib/rubycrawl/markdown_converter.rb
CHANGED
|
@@ -1,27 +1,54 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'uri'
|
|
4
|
+
|
|
3
5
|
class RubyCrawl
|
|
4
6
|
# Converts HTML to Markdown using reverse_markdown gem.
|
|
5
7
|
module MarkdownConverter
|
|
8
|
+
# Patterns for relative URLs in markdown
|
|
9
|
+
MARKDOWN_URL_PATTERNS = [
|
|
10
|
+
%r{(!\[[^\]]*\])\((/[^)]+)\)}, # ![alt](/path)
|
|
11
|
+
%r{(\[[^\]]*\])\((/[^)]+)\)} # [text](/path)
|
|
12
|
+
].freeze
|
|
13
|
+
|
|
6
14
|
module_function
|
|
7
15
|
|
|
8
|
-
# Convert HTML to Markdown.
|
|
16
|
+
# Convert HTML to Markdown with resolved URLs.
|
|
9
17
|
#
|
|
10
18
|
# @param html [String] The HTML content to convert
|
|
19
|
+
# @param base_url [String, nil] Base URL to resolve relative URLs
|
|
11
20
|
# @param options [Hash] Options for conversion
|
|
12
|
-
# @
|
|
13
|
-
|
|
14
|
-
# @return [String] The Markdown content
|
|
15
|
-
def convert(html, options = {})
|
|
21
|
+
# @return [String] The Markdown content with absolute URLs
|
|
22
|
+
def convert(html, base_url: nil, **options)
|
|
16
23
|
return '' if html.nil? || html.empty?
|
|
17
24
|
|
|
18
25
|
require_reverse_markdown
|
|
19
|
-
ReverseMarkdown.convert(html, default_options.merge(options))
|
|
26
|
+
markdown = ReverseMarkdown.convert(html, default_options.merge(options))
|
|
27
|
+
base_url ? resolve_relative_urls(markdown, base_url) : markdown
|
|
20
28
|
rescue LoadError
|
|
21
29
|
warn '[rubycrawl] reverse_markdown gem not installed. Add it to your Gemfile for markdown support.'
|
|
22
30
|
''
|
|
23
31
|
end
|
|
24
32
|
|
|
33
|
+
# Resolve relative URLs in markdown to absolute URLs.
|
|
34
|
+
#
|
|
35
|
+
# @param markdown [String] The markdown content
|
|
36
|
+
# @param base_url [String] The base URL to resolve against
|
|
37
|
+
# @return [String] Markdown with absolute URLs
|
|
38
|
+
def resolve_relative_urls(markdown, base_url)
|
|
39
|
+
return markdown unless base_url
|
|
40
|
+
|
|
41
|
+
base_uri = URI.parse(base_url)
|
|
42
|
+
origin = "#{base_uri.scheme}://#{base_uri.host}"
|
|
43
|
+
origin += ":#{base_uri.port}" unless [80, 443].include?(base_uri.port)
|
|
44
|
+
|
|
45
|
+
MARKDOWN_URL_PATTERNS.reduce(markdown) do |md, pattern|
|
|
46
|
+
md.gsub(pattern) { "#{::Regexp.last_match(1)}(#{origin}#{::Regexp.last_match(2)})" }
|
|
47
|
+
end
|
|
48
|
+
rescue URI::InvalidURIError
|
|
49
|
+
markdown
|
|
50
|
+
end
|
|
51
|
+
|
|
25
52
|
def require_reverse_markdown
|
|
26
53
|
require 'reverse_markdown'
|
|
27
54
|
end
|
data/lib/rubycrawl/result.rb
CHANGED
|
@@ -14,10 +14,18 @@ class RubyCrawl
|
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
# Returns markdown, converting from HTML lazily if needed.
|
|
17
|
+
# Relative URLs are resolved using the page's final_url.
|
|
17
18
|
#
|
|
18
|
-
# @return [String] Markdown content
|
|
19
|
+
# @return [String] Markdown content with absolute URLs
|
|
19
20
|
def markdown
|
|
20
|
-
@markdown ||= MarkdownConverter.convert(html)
|
|
21
|
+
@markdown ||= MarkdownConverter.convert(html, base_url: final_url)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# The final URL after redirects.
|
|
25
|
+
#
|
|
26
|
+
# @return [String, nil]
|
|
27
|
+
def final_url
|
|
28
|
+
metadata['final_url'] || metadata[:final_url]
|
|
21
29
|
end
|
|
22
30
|
|
|
23
31
|
# Check if markdown has been computed.
|
data/lib/rubycrawl/site_crawler.rb
CHANGED
|
@@ -17,9 +17,14 @@ class RubyCrawl
|
|
|
17
17
|
@depth = depth
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
# Lazy markdown conversion.
|
|
20
|
+
# Lazy markdown conversion with resolved URLs.
|
|
21
21
|
def markdown
|
|
22
|
-
@markdown ||= MarkdownConverter.convert(html)
|
|
22
|
+
@markdown ||= MarkdownConverter.convert(html, base_url: final_url)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# The final URL after redirects.
|
|
26
|
+
def final_url
|
|
27
|
+
metadata['final_url'] || metadata[:final_url] || url
|
|
23
28
|
end
|
|
24
29
|
end
|
|
25
30
|
|
data/lib/rubycrawl/version.rb
CHANGED