RubyGems - nous - Versions diffs - 0.2.0 → 0.3.0 - Mend

nous 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/README.md +6 -3
data/lib/nous/cli.rb +13 -10
data/lib/nous/command.rb +2 -2
data/lib/nous/configuration_builder.rb +56 -0
data/lib/nous/converter.rb +1 -1
data/lib/nous/crawler/{page_fetcher.rb → async_page_fetcher.rb} +10 -6
data/lib/nous/crawler/link_extractor.rb +11 -11
data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
data/lib/nous/crawler/redirect_follower.rb +60 -0
data/lib/nous/crawler/single_page_fetcher.rb +72 -0
data/lib/nous/crawler/url_filter.rb +6 -6
data/lib/nous/crawler.rb +15 -70
data/lib/nous/extractor/default/client.rb +50 -0
data/lib/nous/extractor/default.rb +10 -6
data/lib/nous/extractor/jina/client.rb +4 -4
data/lib/nous/extractor/jina.rb +10 -9
data/lib/nous/fetcher/extraction_runner.rb +31 -0
data/lib/nous/fetcher/page_extractor.rb +34 -0
data/lib/nous/fetcher.rb +7 -6
data/lib/nous/primitives/configuration.rb +16 -0
data/lib/nous/primitives/extracted_content.rb +5 -0
data/lib/nous/primitives/raw_page.rb +5 -0
data/lib/nous/primitives/url.rb +45 -0
data/lib/nous/serializer.rb +5 -2
data/lib/nous/url_resolver.rb +25 -0
data/lib/nous/version.rb +1 -1
data/lib/nous.rb +6 -5
metadata +43 -8
data/lib/nous/configuration.rb +0 -39
data/lib/nous/error.rb +0 -5
data/lib/nous/extraction_runner.rb +0 -29
data/lib/nous/extraction_thread.rb +0 -28
data/lib/nous/extractor.rb +0 -46
/data/lib/nous/{page.rb → primitives/page.rb} +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c44bdc52070c6430739f9b0258ea53e3dafc1cff42d87814fd940c2e9e26ee94
-  data.tar.gz: e4b42ca9917d7e4656f8e8bc2d9b8b328781021c2ed02e9e1912bdb9ce8ac744
+  metadata.gz: 7636d207654dbf38a64aeec480164c0e57b3c8bf98ac8373e576f692896fb3a3
+  data.tar.gz: 62ae3b01ec837d71caf104710c42bde82df6d50e6c7acc50252f2902ef9b2046
 SHA512:
-  metadata.gz: f55c5122dd9a53611c7045e648c34870f9e423afae6777d0004f0bc909c0b916fd5a8a0350168d286e2e63339be6ae393f9ee02cbe4703d1a392fceaee317fd0
-  data.tar.gz: fb6bdb6b9c283bc8350a4e697412869e9c1062af0659ad5b56aee6a0cdcad33983f8a15da9a94f3ae37b45b469a5d67a1c5e977d3d2c8b27a59e8f66eeedd59c
+  metadata.gz: af52f527a8720d46cd00f3a42814d432730d105aed05ebb2435f1546afb2140bd88fd1e2c6f4e75c0226afd5ef6c9072c6919518bae366047eb022f24b30ffcd
+  data.tar.gz: '049b133f406f694771617c34d3adfef2aa64ef3aa5608d89d98230b2f596e4cdef831e53f9ee039aa247a6c588900fc51cfd07a7b456dec3ee524e83abe08b93'

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,21 @@
 ## [Unreleased]
+## [0.3.0] - 2026-02-23
+- Remove `Nous::Error` base hierarchy; colocated errors inherit directly from `StandardError` with descriptive names
+- Move extraction pipeline under `Nous::Fetcher::*` namespace (`ExtractionRunner`, `ExtractionThread`)
+- Move readability command into `Nous::Extractor::Default::Client`, mirroring Jina structure
+- `Nous::Extractor` is now a module namespace (implicit via Zeitwerk), no longer a Command
+- Shared `Extractor::ExtractionError` contract: all extractor backends raise this on failure
+- Pull `seed_url` off `Configuration`; `Crawler` owns URL parsing and validation directly
+- Explicit rescue lists in CLI and extraction thread instead of broad `Nous::Error` rescue
+- Rename `--verbose`/`-v` to `--debug`/`-d`; `-v` is now `--version`
+- Add `Nous::Url`, `Nous::UrlResolver`, and `Crawler::RedirectFollower` to correctly handle redirects and path encoding (including spaces)
+- Add `-r`/`--recursive`; default mode now fetches only the seed page unless recursion is explicitly enabled
+- Split crawler fetchers by mode: `Crawler::AsyncPageFetcher`, `Crawler::RecursivePageFetcher`, and `Crawler::SinglePageFetcher`
+- Move configuration construction to `ConfigurationBuilder` and `Data.define`-based `Configuration` primitive
+- Add `faraday-follow_redirects` for single-page redirect handling and update integration/spec coverage for recursive and single-page flows
 ## [0.2.0] - 2026-02-21
 - Promote Configuration to module-level singleton (`Nous.configure`, `Nous.configuration`)

data/README.md CHANGED Viewed

@@ -42,8 +42,8 @@ nous https://example.com -s "article.post"
 # Use Jina Reader API for JS-rendered sites (Next.js, SPAs)
 nous https://example.com --jina
-# Verbose logging
-nous https://example.com -v
+# Debug logging
+nous https://example.com -d
 ```
 ### Options
@@ -58,7 +58,9 @@ nous https://example.com -v
 | `-l`, `--limit N` | Maximum pages to fetch | `100` |
 | `--timeout N` | Per-request timeout in seconds | `15` |
 | `--jina` | Use Jina Reader API for extraction | off |
-| `-v`, `--verbose` | Verbose logging to stderr | off |
+| `-v`, `--version` | Print version and exit | off |
+| `-h`, `--help` | Print usage and exit | off |
+| `-d`, `--debug` | Debug logging to stderr | off |
 ## Ruby API
@@ -134,6 +136,7 @@ Extracted markdown content...
 bin/setup               # Install dependencies
 bundle exec rspec       # Run tests
 bundle exec standardrb  # Lint
+bundle exec exe/nous    # Run the command line in-development
 ```
 ## License

data/lib/nous/cli.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require "optparse"
 module Nous
   class Cli
-    class Error < Nous::Error; end
+    class CliError < StandardError; end
     def initialize(argv)
       @argv = argv
@@ -18,7 +18,9 @@ module Nous
       pages = Nous.fetch(seed_url, **fetch_options)
       output = Nous.serialize(pages, format: options[:format])
       write_output(output)
-    rescue Nous::Error => e
+    rescue CliError,
+      Fetcher::FetchError,
+      Serializer::SerializationError => e
       warn("nous: #{e.message}")
       exit 1
     end
@@ -32,7 +34,7 @@ module Nous
     end
     def fetch_options
-      opts = options.slice(:concurrency, :match, :limit, :timeout, :verbose)
+      opts = options.slice(*Configuration.members)
       opts[:extractor] = extractor
       opts
     end
@@ -44,7 +46,7 @@ module Nous
     end
     def validate!
-      raise Error, "no URL provided. Usage: nous <url> [options]" unless seed_url
+      raise CliError, "no URL provided. Usage: nous <url> [options]" unless seed_url
     end
     def write_output(output)
@@ -58,7 +60,7 @@ module Nous
     def parse_options!
       parser.parse!(argv)
     rescue OptionParser::InvalidOption => e
-      raise Error, e.message
+      raise CliError, e.message
     end
     def parser
@@ -77,13 +79,14 @@ module Nous
         opts.on("-l", "--limit N", Integer, "Maximum pages to fetch") { |v| options[:limit] = v }
         opts.on("--timeout N", Integer, "Per-request timeout in seconds (default: 15)") { |v| options[:timeout] = v }
         opts.on("--jina", "Use Jina Reader API for extraction (handles JS-rendered sites)") { options[:jina] = true }
-        opts.on("-v", "--verbose", "Verbose logging to stderr") { options[:verbose] = true }
-        opts.on("-h", "--help", "Show help") do
-          $stdout.puts(opts)
+        opts.on("-r", "--recursive", "Follow same-host links recursively") { options[:recursive] = true }
+        opts.on("-d", "--debug", "Debug logging to stderr") { options[:debug] = true }
+        opts.on("-v", "--version", "Show version") do
+          $stdout.puts("nous #{Nous::VERSION}")
           exit
         end
-        opts.on("--version", "Show version") do
-          $stdout.puts("nous #{Nous::VERSION}")
+        opts.on("-h", "--help", "Show help") do
+          $stdout.puts(opts)
           exit
         end
       end

data/lib/nous/command.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module Nous
   class Command
-    class Error < Nous::Error; end
+    class CommandError < StandardError; end
     class Result
       attr_reader :payload, :error, :metadata
@@ -27,7 +27,7 @@ module Nous
       command = new(...)
       command.call
     rescue => e
-      return command.failure(Error.new("unexpected: #{e.message}")) if command
+      return command.failure(CommandError.new("unexpected: #{e.message}")) if command
       Result.new(success: false, error: e)
     end

data/lib/nous/configuration_builder.rb ADDED Viewed

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+module Nous
+  class ConfigurationBuilder
+    class UnknownOptionError < StandardError; end
+    DEFAULTS = {
+      concurrency: 3,
+      match: [],
+      limit: 100,
+      timeout: 15,
+      debug: false,
+      keep_query: false,
+      recursive: false
+    }.freeze
+    def self.call(**options)
+      new(options).call
+    end
+    def initialize(options)
+      @options = options
+    end
+    def call
+      validate_keys!
+      Configuration.new(**coerced_options)
+    end
+    private
+    attr_reader :options
+    def validate_keys!
+      unknown = options.keys - Configuration.members
+      return if unknown.empty?
+      raise UnknownOptionError, "unknown option(s): #{unknown.join(", ")}"
+    end
+    def coerced_options
+      merged = DEFAULTS.merge(options)
+      {
+        concurrency: Integer(merged[:concurrency]).clamp(1, 20),
+        match: Array(merged[:match]),
+        limit: Integer(merged[:limit]).clamp(1, 10_000),
+        timeout: Integer(merged[:timeout]),
+        debug: !!merged[:debug],
+        keep_query: !!merged[:keep_query],
+        recursive: !!merged[:recursive]
+      }
+    end
+  end
+end

data/lib/nous/converter.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require "reverse_markdown"
 module Nous
   class Converter < Command
-    class Error < Command::Error; end
+    class ConversionError < StandardError; end
     def initialize(html:)
       @html = html

data/lib/nous/crawler/{page_fetcher.rb → async_page_fetcher.rb} RENAMED Viewed

@@ -2,20 +2,24 @@
 module Nous
   class Crawler < Command
-    class PageFetcher
+    class AsyncPageFetcher
       HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
-      def initialize(client:)
+      def initialize(client:, seed_host:)
         @client = client
+        @seed_host = seed_host
       end
       def fetch(url)
         Async::Task.current.with_timeout(config.timeout) do
-          response = client.get(url, {})
+          result = RedirectFollower.call(client:, seed_host:, url:)
+          return skip(url, result.error.message) if result.failure?
+          response, final_url = result.payload
           return skip(url, "status #{response.status}") unless response.status == 200
           return skip(url, "non-html content") unless html?(response)
-          {url:, pathname: URI.parse(url).path, html: response.read}
+          RawPage.new(url: final_url.to_s, pathname: final_url.path, html: response.read)
         ensure
           response&.close
         end
@@ -27,7 +31,7 @@ module Nous
       private
-      attr_reader :client
+      attr_reader :client, :seed_host
       def config
         Nous.configuration
@@ -39,7 +43,7 @@ module Nous
       end
       def skip(url, reason)
-        warn("[nous] skip #{url}: #{reason}") if config.verbose?
+        warn("[nous] skip #{url}: #{reason}") if config.debug?
         nil
       end
     end

data/lib/nous/crawler/link_extractor.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require "nokogiri"
 module Nous
   class Crawler < Command
     class LinkExtractor
@@ -8,9 +10,7 @@ module Nous
       end
       def extract(current_url, html)
-        base_uri = URI.parse(current_url)
-        anchors(html).filter_map { |href| resolve(base_uri, href) }.uniq
+        anchors(html).filter_map { |href| resolve(current_url, href) }.uniq
       end
       private
@@ -21,19 +21,19 @@ module Nous
         Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
       end
-      def resolve(base_uri, href)
+      def resolve(current_url, href)
         return unless url_filter.allowed?(href)
-        uri = URI.join(base_uri, href)
-        return unless url_filter.same_host?(uri)
+        result = UrlResolver.call(base_url: current_url, href:)
+        return unless result.success?
+        url = result.payload
+        return unless url_filter.same_host?(url)
-        canonical = url_filter.canonicalize(uri)
-        return unless url_filter.matches_path?(URI.parse(canonical).path)
+        canonical = url_filter.canonicalize(url)
+        return unless url_filter.matches_path?(Url.new(canonical).path)
         canonical
-      rescue URI::InvalidURIError => e
-        warn("[nous] malformed href #{href.inspect}: #{e.message}") if Nous.configuration.verbose?
-        nil
       end
     end
   end

data/lib/nous/crawler/recursive_page_fetcher.rb ADDED Viewed

@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+require "async"
+require "async/http/internet"
+module Nous
+  class Crawler < Command
+    class RecursivePageFetcher < Command
+      def initialize(seed_url:, http_client: nil)
+        @seed_uri = Url.new(seed_url)
+        @http_client = http_client
+        @pages = []
+        @queue = [url_filter.canonicalize(seed_uri)]
+        @seen = Set.new(queue)
+      end
+      def call
+        suppress_async_warnings unless config.debug?
+        open_connection do |client|
+          crawl(client)
+        end
+        success(payload: pages)
+      end
+      private
+      attr_reader :seed_uri, :http_client, :pages, :queue, :seen
+      def config
+        Nous.configuration
+      end
+      def crawl(client)
+        fetch_and_enqueue(queue.shift(config.concurrency), client) while queue.any? && within_limit?
+      end
+      def fetch_and_enqueue(batch, client)
+        fetch_batch(batch, client).each do |page|
+          next unless page
+          break unless within_limit?
+          pages << page
+          seen << page.url
+          enqueue_links(page)
+        end
+      end
+      def fetch_batch(urls, client)
+        tasks = []
+        Async do |task|
+          urls.each do |url|
+            tasks << task.async { page_fetcher(client).fetch(url) }
+          end
+        end.wait
+        tasks.map(&:wait)
+      end
+      def enqueue_links(page)
+        link_extractor.extract(page.url, page.html).each do |url|
+          next if seen.include?(url)
+          seen << url
+          queue << url
+        end
+      end
+      def within_limit?
+        pages.length < config.limit
+      end
+      def open_connection
+        client = http_client || Async::HTTP::Internet.new
+        Async do
+          yield client
+        ensure
+          client.close
+        end.wait
+      end
+      def page_fetcher(client)
+        AsyncPageFetcher.new(client:, seed_host: seed_uri.host)
+      end
+      def url_filter
+        @url_filter ||= UrlFilter.new(seed_uri:)
+      end
+      def link_extractor
+        @link_extractor ||= LinkExtractor.new(url_filter:)
+      end
+      def suppress_async_warnings
+        require "console"
+        Console.logger.level = :error
+      end
+    end
+  end
+end

data/lib/nous/crawler/redirect_follower.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module Nous
+  class Crawler < Command
+    class RedirectFollower < Command
+      class RedirectError < StandardError; end
+      MAX_HOPS = 5
+      def initialize(client:, seed_host:, url:, hops_remaining: MAX_HOPS)
+        @client = client
+        @seed_host = seed_host
+        @url = url
+        @hops_remaining = hops_remaining
+      end
+      def call
+        response = client.get(url, {})
+        return success(payload: [response, Url.new(url)]) unless redirect?(response.status)
+        response.close
+        follow(response.headers["location"])
+      end
+      private
+      attr_reader :client, :seed_host, :url, :hops_remaining
+      def redirect?(status)
+        (300..399).cover?(status)
+      end
+      def follow(location)
+        target = resolve_target(location)
+        return target if target.failure?
+        self.class.call(client:, seed_host:, url: target.payload.to_s, hops_remaining: hops_remaining - 1)
+      end
+      def resolve_target(location)
+        return failure(RedirectError.new("redirect without location from #{url}")) unless location
+        return failure(RedirectError.new("too many redirects from #{url}")) if hops_remaining <= 0
+        result = UrlResolver.call(base_url: url, href: location)
+        return failure(RedirectError.new(result.error.message)) if result.failure?
+        unless safe?(result.payload)
+          return failure(RedirectError.new("redirect to #{result.payload} outside #{seed_host}"))
+        end
+        result
+      end
+      def safe?(target)
+        target.http? && target.host == seed_host
+      end
+    end
+  end
+end

data/lib/nous/crawler/single_page_fetcher.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+require "faraday"
+require "faraday/follow_redirects"
+module Nous
+  class Crawler < Command
+    class SinglePageFetcher < Command
+      class FetchError < StandardError; end
+      HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
+      MAX_REDIRECTS = 5
+      def initialize(url:, http_client: nil)
+        @url = url
+        @seed_host = Url.new(url).host
+        @connection = http_client || build_connection
+      end
+      def call
+        response = connection.get(url)
+        final_url = resolve_final_url(response)
+        validate_host!(final_url)
+        validate_html!(response)
+        raw_page = RawPage.new(url: final_url.to_s, pathname: final_url.path, html: response.body)
+        success(payload: [raw_page])
+      rescue FetchError => e
+        failure(e)
+      rescue Faraday::Error => e
+        failure(FetchError.new(e.message))
+      end
+      private
+      attr_reader :url, :seed_host, :connection
+      def config
+        Nous.configuration
+      end
+      def resolve_final_url(response)
+        location = response.env.url.to_s
+        Url.new(location)
+      end
+      def validate_host!(final_url)
+        return if final_url.host == seed_host
+        raise FetchError, "redirected to #{final_url} outside #{seed_host}"
+      end
+      def validate_html!(response)
+        content_type = response.headers["content-type"].to_s
+        return if HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
+        raise FetchError, "non-html content: #{content_type}"
+      end
+      def build_connection
+        Faraday.new do |f|
+          f.response :follow_redirects, limit: MAX_REDIRECTS
+          f.response :raise_error
+          f.options.timeout = config.timeout
+          f.options.open_timeout = config.timeout
+        end
+      end
+    end
+  end
+end

data/lib/nous/crawler/url_filter.rb CHANGED Viewed

@@ -5,10 +5,10 @@ module Nous
     class UrlFilter
       IGNORED_SCHEMES = %w[mailto: javascript: tel:].freeze
-      def initialize(config)
-        @host = config.seed.host
-        @match = config.match
-        @keep_query = config.keep_query
+      def initialize(seed_uri:)
+        @host = seed_uri.host
+        @match = Nous.configuration.match
+        @keep_query = Nous.configuration.keep_query
       end
       def canonicalize(uri)
@@ -25,8 +25,8 @@ module Nous
         IGNORED_SCHEMES.none? { |s| href.start_with?(s) }
       end
-      def same_host?(uri)
-        uri.is_a?(URI::HTTP) && uri.host == host
+      def same_host?(url)
+        url.http? && url.host == host
       end
       def matches_path?(path)

data/lib/nous/crawler.rb CHANGED Viewed

@@ -1,91 +1,36 @@
 # frozen_string_literal: true
-require "async"
-require "async/http/internet"
-require "nokogiri"
-require "uri"
 module Nous
   class Crawler < Command
-    class Error < Command::Error; end
+    class CrawlError < StandardError; end
-    def initialize(seed_url:)
+    def initialize(seed_url:, http_client: nil)
       @seed_url = seed_url
+      @http_client = http_client
+      parse_seed!
     end
     def call
-      suppress_async_warnings unless config.verbose?
-      pages = []
-      queue = [url_filter.canonicalize(config.seed)]
-      seen = Set.new(queue)
-      Async do
-        client = Async::HTTP::Internet.new
-        begin
-          crawl(queue:, seen:, pages:, client:)
-        ensure
-          client.close
-        end
-      end.wait
-      success(payload: pages)
+      if config.recursive?
+        RecursivePageFetcher.call(seed_url:, http_client:)
+      else
+        SinglePageFetcher.call(url: seed_url, http_client:)
+      end
     end
     private
-    attr_reader :seed_url
+    attr_reader :seed_url, :http_client
     def config
       Nous.configuration
     end
-    def crawl(queue:, seen:, pages:, client:)
-      while queue.any? && pages.length < config.limit
-        batch = queue.shift(config.concurrency)
-        fetch_batch(batch, client).each do |page|
-          next unless page
-          pages << page
-          break if pages.length >= config.limit
-          link_extractor.extract(page[:url], page[:html]).each do |url|
-            next if seen.include?(url)
-            seen << url
-            queue << url
-          end
-        end
-      end
-    end
-    def fetch_batch(urls, client)
-      tasks = []
-      Async do |task|
-        urls.each do |url|
-          tasks << task.async { page_fetcher(client).fetch(url) }
-        end
-      end.wait
-      tasks.map(&:wait)
-    end
-    def url_filter
-      @url_filter ||= UrlFilter.new(config)
-    end
-    def link_extractor
-      @link_extractor ||= LinkExtractor.new(url_filter:)
-    end
-    def page_fetcher(client)
-      PageFetcher.new(client:)
-    end
-    def suppress_async_warnings
-      require "console"
-      Console.logger.level = :error
+    def parse_seed!
+      parsed = Url.new(seed_url)
+      raise CrawlError, "seed URL must be http or https" unless parsed.http?
+    rescue ArgumentError => e
+      raise CrawlError, "invalid seed URL: #{e.message}"
     end
   end
 end