rubycrawl 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f463d9ba6ffa83c283954dd411a08dc0184ed065128f57da625b9c349447b77a
4
- data.tar.gz: cc8adb28596fe65e54f18ec97d83152c9a8df6d38c3cf584c359f2a2230e6048
3
+ metadata.gz: 06ec3c9fa168060673012766dd0d217911aeee642c3ef545e31058927a202a72
4
+ data.tar.gz: 4ef9134d6398175ffc4e2cbb9e85a1b1a0d954d7db1f90d96388aa8319692cd7
5
5
  SHA512:
6
- metadata.gz: 98c20cc8a1ff17df7a830e93f6aa49e5c630c7d43533e516648b4c4fdc301c7e733ab9aba6502d6de7bb5b5f1afe40f037d4fad59e77051322780dba5c575fa2
7
- data.tar.gz: a789dea3bfbd3c63dc8d364da49b38904675d1311dc143dacbb0cf58631a0e8d59d8b3484148dde85b6efb03f8ec3caf6026e209239486219c7f55a7c955ff5c
6
+ metadata.gz: 0bbc9345c7cbb3e45b8af62baa473c0d01b07a71c24ec0f155b3532e54ffb93a98aed685794adba94b355cc6ccdfa9785d0c3e49dcc40213b386a060a4612b26
7
+ data.tar.gz: dec9d0d9333f44a642d296fe31d1585c18a289824c327bc59f5812b6726a7fd30d5ff4fffafdf8fdb74a37a69135725597fd292df5ed26088c3e8d3d8b792fc7
@@ -11,7 +11,7 @@ class RubyCrawl
11
11
  uri = URI.parse(url)
12
12
 
13
13
  unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
14
- raise ConfigurationError, "Only HTTP(S) URLs are supported, got: #{url}"
14
+ raise ConfigurationError, "Invalid URL: Only HTTP(S) URLs are supported, got: #{url}"
15
15
  end
16
16
 
17
17
  if uri.host&.match?(/^(localhost|127\.|192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[01]))/)
@@ -1,27 +1,54 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'uri'
4
+
3
5
  class RubyCrawl
4
6
  # Converts HTML to Markdown using reverse_markdown gem.
5
7
  module MarkdownConverter
8
+ # Patterns for relative URLs in markdown
9
+ MARKDOWN_URL_PATTERNS = [
10
+ %r{(!\[[^\]]*\])\((/[^)]+)\)}, # ![alt](/path)
11
+ %r{(\[[^\]]*\])\((/[^)]+)\)} # [text](/path)
12
+ ].freeze
13
+
6
14
  module_function
7
15
 
8
- # Convert HTML to Markdown.
16
+ # Convert HTML to Markdown with resolved URLs.
9
17
  #
10
18
  # @param html [String] The HTML content to convert
19
+ # @param base_url [String, nil] Base URL to resolve relative URLs
11
20
  # @param options [Hash] Options for conversion
12
- # @option options [Boolean] :unknown_tags (:bypass) How to handle unknown tags
13
- # @option options [Boolean] :github_flavored (true) Use GitHub-flavored markdown
14
- # @return [String] The Markdown content
15
- def convert(html, options = {})
21
+ # @return [String] The Markdown content with absolute URLs
22
+ def convert(html, base_url: nil, **options)
16
23
  return '' if html.nil? || html.empty?
17
24
 
18
25
  require_reverse_markdown
19
- ReverseMarkdown.convert(html, default_options.merge(options))
26
+ markdown = ReverseMarkdown.convert(html, default_options.merge(options))
27
+ base_url ? resolve_relative_urls(markdown, base_url) : markdown
20
28
  rescue LoadError
21
29
  warn '[rubycrawl] reverse_markdown gem not installed. Add it to your Gemfile for markdown support.'
22
30
  ''
23
31
  end
24
32
 
33
+ # Resolve relative URLs in markdown to absolute URLs.
34
+ #
35
+ # @param markdown [String] The markdown content
36
+ # @param base_url [String] The base URL to resolve against
37
+ # @return [String] Markdown with absolute URLs
38
+ def resolve_relative_urls(markdown, base_url)
39
+ return markdown unless base_url
40
+
41
+ base_uri = URI.parse(base_url)
42
+ origin = "#{base_uri.scheme}://#{base_uri.host}"
43
+ origin += ":#{base_uri.port}" unless [80, 443].include?(base_uri.port)
44
+
45
+ MARKDOWN_URL_PATTERNS.reduce(markdown) do |md, pattern|
46
+ md.gsub(pattern) { "#{::Regexp.last_match(1)}(#{origin}#{::Regexp.last_match(2)})" }
47
+ end
48
+ rescue URI::InvalidURIError
49
+ markdown
50
+ end
51
+
25
52
  def require_reverse_markdown
26
53
  require 'reverse_markdown'
27
54
  end
@@ -14,10 +14,18 @@ class RubyCrawl
14
14
  end
15
15
 
16
16
  # Returns markdown, converting from HTML lazily if needed.
17
+ # Relative URLs are resolved using the page's final_url.
17
18
  #
18
- # @return [String] Markdown content
19
+ # @return [String] Markdown content with absolute URLs
19
20
  def markdown
20
- @markdown ||= MarkdownConverter.convert(html)
21
+ @markdown ||= MarkdownConverter.convert(html, base_url: final_url)
22
+ end
23
+
24
+ # The final URL after redirects.
25
+ #
26
+ # @return [String, nil]
27
+ def final_url
28
+ metadata['final_url'] || metadata[:final_url]
21
29
  end
22
30
 
23
31
  # Check if markdown has been computed.
@@ -17,9 +17,14 @@ class RubyCrawl
17
17
  @depth = depth
18
18
  end
19
19
 
20
- # Lazy markdown conversion.
20
+ # Lazy markdown conversion with resolved URLs.
21
21
  def markdown
22
- @markdown ||= MarkdownConverter.convert(html)
22
+ @markdown ||= MarkdownConverter.convert(html, base_url: final_url)
23
+ end
24
+
25
+ # The final URL after redirects.
26
+ def final_url
27
+ metadata['final_url'] || metadata[:final_url] || url
23
28
  end
24
29
  end
25
30
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class RubyCrawl
4
- VERSION = '0.1.0'
4
+ VERSION = '0.1.2'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - RubyCrawl contributors