RubyGems - fetch_util - Versions diffs - 0.3.0 - Mend

fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

checksums.yaml +7 -0
data/.rspec +2 -0
data/.rubocop.yml +97 -0
data/CHANGELOG.md +48 -0
data/LICENSE.txt +21 -0
data/README.md +199 -0
data/Rakefile +18 -0
data/SKILL.md +92 -0
data/exe/fetch_util +6 -0
data/lib/fetch_util/assets/extract.js +1 -0
data/lib/fetch_util/assets/vendor/readability.js +2314 -0
data/lib/fetch_util/assets/vendor/turndown.js +974 -0
data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
data/lib/fetch_util/browser/navigation.rb +13 -0
data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
data/lib/fetch_util/browser/site_stabilization.rb +13 -0
data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
data/lib/fetch_util/browser/stabilization.rb +13 -0
data/lib/fetch_util/browser.rb +135 -0
data/lib/fetch_util/cli.rb +124 -0
data/lib/fetch_util/extractor.rb +56 -0
data/lib/fetch_util/fetcher.rb +242 -0
data/lib/fetch_util/parallel_fetcher.rb +97 -0
data/lib/fetch_util/raw_docs_fallback.rb +260 -0
data/lib/fetch_util/regulatory/cache_store.rb +92 -0
data/lib/fetch_util/regulatory/directives.rb +106 -0
data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
data/lib/fetch_util/regulatory/headers.rb +39 -0
data/lib/fetch_util/regulatory/http_client.rb +70 -0
data/lib/fetch_util/regulatory/human.rb +104 -0
data/lib/fetch_util/regulatory/orchestration.rb +82 -0
data/lib/fetch_util/regulatory/page.rb +70 -0
data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
data/lib/fetch_util/regulatory/robots.rb +117 -0
data/lib/fetch_util/regulatory/signals.rb +106 -0
data/lib/fetch_util/regulatory/source_selection.rb +60 -0
data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
data/lib/fetch_util/regulatory.rb +74 -0
data/lib/fetch_util/request_log.rb +24 -0
data/lib/fetch_util/result.rb +58 -0
data/lib/fetch_util/searcher/result_filtering.rb +102 -0
data/lib/fetch_util/searcher.rb +332 -0
data/lib/fetch_util/version.rb +5 -0
data/lib/fetch_util.rb +115 -0
metadata +145 -0

data/lib/fetch_util/cli.rb ADDED Viewed

@@ -0,0 +1,124 @@
+# frozen_string_literal: true
+require "json"
+require "thor"
+module FetchUtil
+  class CLI < Thor # Thor command-line front end exposing the version, fetch, search and regulatory commands.
+    DEFAULT_FETCH_FIELDS = %i[
+      url
+      final_url
+      canonical_url
+      title
+      byline
+      site_name
+      published_time
+      markdown
+      content_type
+      suspect
+      warnings
+    ].freeze # Result#to_h keys that result_payload keeps in JSON/JSONL output.
+    class_option :log_path, type: :string, desc: "Append-only request log path"
+    class_option :format, type: :string, default: "markdown", enum: %w[markdown json jsonl], desc: "Output format"
+    class_option :timeout, type: :numeric, default: 20 # Passed on via fetch_options and to FetchUtil.regulatory; presumably seconds (confirm).
+    class_option :wait, type: :numeric, default: 0.75 # Passed on as wait:; meaning is defined by the receiver (not visible here).
+    class_option :concurrency, type: :numeric, default: 4 # Used by fetch_many; in search it is capped by the number of sources.
+    class_option :reader_mode, type: :boolean, default: true
+    class_option :wait_for_idle, type: :boolean, default: true
+    class_option :include_html, type: :boolean, default: false, desc: "Include raw html in fetch output"
+    desc "version", "Display fetch_util version"
+    def version # Prints FetchUtil::VERSION to stdout.
+      puts FetchUtil::VERSION
+    end
+    desc "fetch URL [URL...]", "Fetch one or more URLs"
+    def fetch(*urls) # Fetches every URL and prints markdown or JSON/JSONL; raises ArgumentError when no URL is given.
+      raise ArgumentError, "at least one URL is required" if urls.empty?
+      results = if urls.length == 1 # One URL goes through FetchUtil.fetch, several through FetchUtil.fetch_many.
+                  [FetchUtil.fetch(urls.first, **fetch_options, request_log: request_log)]
+                else
+                  FetchUtil.fetch_many(urls, **fetch_options, request_log: request_log, concurrency: options[:concurrency])
+                end
+      if options[:format] == "markdown"
+        results.each_with_index do |result, index|
+          puts "\n---\n\n" if index > 0 # Separator line between consecutive documents.
+          puts result.markdown
+        end
+      else
+        emit(urls.length == 1 && options[:format] == "json" ? result_payload(results.first) : results.map { |result| result_payload(result) }) # A single URL with json emits one object; every other case emits an array.
+      end
+    end
+    desc "search QUERY", "Search across configured engines and aggregate results"
+    option :source, type: :array, default: FetchUtil::Searcher::DEFAULT_SOURCES, desc: "Search sources"
+    option :limit, type: :numeric, default: 10
+    option :verbose_search, type: :boolean, default: false, desc: "Include per-result search provenance"
+    def search(*terms) # Joins the positional terms into one query and emits the Searcher payload; raises ArgumentError when blank.
+      query = terms.join(" ").strip
+      raise ArgumentError, "query is required" if query.empty?
+      payload = Searcher.new(
+        request_log: request_log,
+        sources: options[:source],
+        limit: options[:limit],
+        concurrency: [options[:concurrency], options[:source].length].min, # Never more workers than sources.
+        verbose: options[:verbose_search],
+        **fetch_options
+      ).search(query)
+      emit(payload)
+    end
+    desc "regulatory URL", "Inspect regulatory crawl, index, and TDM signals for one URL"
+    option :sources, type: :string, default: "machine", desc: "Comma-separated source selectors, e.g. machine,-robotstxt or human,machine,-human"
+    option :cache_path, type: :string, desc: "Structured regulatory cache directory"
+    def regulatory(url) # Logs a regulatory:// pseudo-URL first, then emits the FetchUtil.regulatory payload; raises ArgumentError on a blank url.
+      raise ArgumentError, "url is required" if url.to_s.strip.empty?
+      request_log.append("regulatory://#{url}?sources=#{options[:sources]}")
+      payload = FetchUtil.regulatory(
+        url,
+        cache_path: options[:cache_path],
+        sources: options[:sources],
+        timeout: options[:timeout]
+      )
+      emit(payload)
+    end
+    no_commands do # Helpers defined in this block are not registered as Thor commands.
+      def request_log # Memoized RequestLog; path precedence: --log_path, then FETCH_UTIL_REQUEST_LOG, then RequestLog::DEFAULT_PATH.
+        @request_log ||= RequestLog.new(path: options[:log_path] || ENV.fetch("FETCH_UTIL_REQUEST_LOG", RequestLog::DEFAULT_PATH))
+      end
+      def fetch_options # Keyword options shared by the fetch and search commands.
+        {
+          timeout: options[:timeout],
+          wait: options[:wait],
+          wait_for_idle: options[:wait_for_idle],
+          reader_mode: options[:reader_mode]
+        }
+      end
+      def result_payload(result) # JSON-ready hash: whitelisted fields, optional html, then nil and empty-string values dropped.
+        payload = result.to_h
+        payload = payload.select { |key, _value| DEFAULT_FETCH_FIELDS.include?(key) }
+        payload[:html] = result.html if options[:include_html]
+        payload.reject { |_key, value| value.nil? || value == "" } # Also removes :html again when it is nil or empty.
+      end
+      def emit(payload) # Prints payload as JSON; only jsonl with an Array prints one JSON document per element.
+        if options[:format] == "jsonl" && payload.is_a?(Array)
+          payload.each { |item| puts JSON.generate(item) }
+        else
+          puts JSON.generate(payload) # Reached for json and also for markdown (search/regulatory call emit regardless of format).
+        end
+      end
+    end
+  end
+end

data/lib/fetch_util/extractor.rb ADDED Viewed

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+require "json"
+module FetchUtil
+  class Extractor # Injects the bundled JavaScript assets into a page and runs window.FetchUtilExtract.extract in it.
+    def initialize(reader_mode: true, asset_root: nil) # asset_root defaults to the "assets" directory next to this file.
+      @reader_mode = reader_mode
+      @asset_root = asset_root || File.join(__dir__, "assets")
+    end
+    def extract(page) # Returns the extraction Hash; raises ExtractionError for a non-Hash payload or a Ferrum JavaScript/status/timeout error.
+      payload = extract_payload(page)
+      raise ExtractionError, "Page extraction returned no content" unless payload.is_a?(Hash)
+      payload
+    rescue Ferrum::JavaScriptError, Ferrum::StatusError, Ferrum::TimeoutError => e
+      raise ExtractionError, e.message # Only the message is carried over into the ExtractionError.
+    end
+    private
+    def inject_assets(page) # Adds readability.js, turndown.js and extract.js as script tags, in that order.
+      page.add_script_tag(path: asset_path("vendor/readability.js"))
+      page.add_script_tag(path: asset_path("vendor/turndown.js"))
+      page.add_script_tag(path: asset_path("extract.js"))
+    end
+    def inject_assets_inline(page) # Same three scripts, but read from disk and evaluated as source text instead of script tags.
+      %w[vendor/readability.js vendor/turndown.js extract.js].each do |relative_path|
+        script = File.read(asset_path(relative_path), encoding: "UTF-8")
+        page.evaluate("#{script}\ntrue")
+      end
+    end
+    def extract_payload(page) # Tries script-tag injection first; on Ferrum::TimeoutError stops the page load and retries with inline injection.
+      inject_assets(page)
+      page.evaluate(extraction_call)
+    rescue Ferrum::TimeoutError
+      begin
+        page.evaluate("window.stop && window.stop()")
+      rescue Ferrum::Error # Best effort: a failing window.stop is ignored so the inline retry still runs.
+      end
+      inject_assets_inline(page)
+      page.evaluate(extraction_call) # Errors from this second attempt propagate to the caller (extract).
+    end
+    def extraction_call # JavaScript expression that calls the extractor with the reader_mode flag serialized as JSON.
+      "window.FetchUtilExtract.extract(#{JSON.generate(reader_mode: @reader_mode)})"
+    end
+    def asset_path(relative_path) # Resolves a bundled asset path under @asset_root.
+      File.join(@asset_root, relative_path)
+    end
+  end
+end

data/lib/fetch_util/fetcher.rb ADDED Viewed

@@ -0,0 +1,242 @@
+# frozen_string_literal: true
+require "uri"
+module FetchUtil
+  class Fetcher # Fetches one URL through a browser page, extracts its content, and can fall back to RawDocsFallback for docs-like URLs.
+    HOMEPAGE_INDEX_PATTERN = Regexp.new( # Case-insensitive headline phrases in many languages, used by homepage_index_markdown?.
+      "top stories|breaking news|latest news|headlines|" \
+      "aktuelle nachrichten|schlagzeilen|neueste nachrichten|" \
+      "à la une|dernières nouvelles|actualités|últimas noticias|" \
+      "noticias principales|notizie principali|ultime notizie|" \
+      "najnowsze wiadomości|najważniejsze|ostatnie wiadomości|aktualności|" \
+      "actualiteit|laatste nieuws|senaste nyheter|seneste nyheder|" \
+      "siste nytt|tuoreimmat uutiset|aktuálně|legfrissebb|" \
+      "cele mai noi știri|aktualności|најновије вести|останні новини|" \
+      "τελευταία νέα|güncel haberler|son dakika|senaste nyheterna|" \
+      "viktigaste nyheterna|aktualitātes|jaunākās ziņas|naujienos|" \
+      "svarbiausios naujienos|главные новости|últimas notícias|" \
+      "najnovšie správy|najnovije vijesti|derniers articles",
+      Regexp::IGNORECASE
+    ).freeze # NOTE(review): "aktualności" appears twice in the alternation; harmless but redundant.
+    DOCS_PORTAL_TITLE_PATTERN = /documentation|docs|the ultimate server/i # Title hints checked by poor_docs_result?.
+    STRIPPED_QUERY_PARAM_PATTERNS = [ # Query parameter names that normalized_result_url removes from result URLs.
+      /\A(?:__goaway_|__cf_chl_)/,
+      /\A(?:utm_[a-z]+|fbclid|gclid|mc_cid|mc_eid)\z/,
+      /\A__gr(?:sc|ts|ua|rn)\z/
+    ].freeze
+    SECOND_LEVEL_COUNTRY_TLDS = /\A(co|com|org|net|gov|edu|ac)\z/ # Second-level labels kept together with a two-letter TLD in effective_domain.
+    GOOGLE_HOST_PATTERN = /\Agoogle\.[a-z.]+\z/ # google.<tld> hosts, matched against the FetchUtil.strip_www_host result in aggregator_url?.
+    def initialize(browser: nil, extractor: nil, **options) # browser, extractor, :raw_docs_fallback and :request_log can be injected; other options configure the defaults.
+      @timeout = options.fetch(:timeout, 20)
+      @browser = browser || Browser.new(**browser_options(options))
+      @extractor = extractor || Extractor.new(reader_mode: options.fetch(:reader_mode, true))
+      @raw_docs_fallback = options[:raw_docs_fallback] || RawDocsFallback.new(timeout: @timeout)
+      @request_log = options[:request_log] # Optional; log_request is a no-op when this is nil.
+    end
+    def quit # Delegates to the browser's quit.
+      @browser.quit
+    end
+    def fetch(url) # Returns a Result for url; logs every attempt; re-raises BrowserError/ExtractionError when no fallback applies.
+      t0 = monotonic_now
+      result = @browser.with_page(url) do |page|
+        payload = @extractor.extract(page)
+        build_result(url, page.current_url, payload)
+      end
+      fallback = docs_fallback_candidate?(url, result) && poor_docs_result?(result) ? @raw_docs_fallback.fetch(url) : nil # Raw fallback is tried only for docs-like URLs with a poor extraction.
+      result = fallback_result(url, fallback) if fallback
+      log_request(url, t0)
+      result
+    rescue BrowserError, ExtractionError => e
+      fallback = docs_fallback_candidate?(url) ? @raw_docs_fallback.fetch(url) : nil # No result exists here, so only the requested URL is checked.
+      if fallback
+        result = fallback_result(url, fallback)
+        log_request(url, t0)
+        return result
+      end
+      log_request(url, t0)
+      raise e
+    end
+    private
+    def build_result(url, final_url, payload) # Normalizes URLs, derives content type and warnings, and builds the Result from the string-keyed payload.
+      final_url = normalized_result_url(final_url)
+      canonical_url = normalized_result_url(payload["canonicalUrl"])
+      homepage_like = homepage_like?(final_url)
+      content_type = resolved_content_type(homepage_like, payload)
+      warnings = resolved_warnings(content_type, homepage_like, payload, requested_url: url, final_url: final_url)
+      suspect = warnings.any? # Any warning at all marks the result as suspect.
+      completeness_ratio = payload["contentCompletenessRatio"]&.to_f || 1.0 # Defaults to 1.0 when the payload has no ratio.
+      content_format = payload["contentFormat"]
+      paywall_state = payload["paywallState"]
+      metadata = { # Frozen hash repeating most Result fields (no url, final_url, html or markdown; final_url is stored as content_url).
+        title: payload["title"],
+        byline: payload["byline"],
+        excerpt: payload["excerpt"],
+        site_name: payload["siteName"],
+        published_time: payload["publishedTime"],
+        canonical_url: canonical_url,
+        language: payload["language"],
+        content_url: final_url,
+        reader_mode: payload["readerMode"],
+        content_type: content_type,
+        suspect: suspect,
+        warnings: warnings,
+        content_completeness_ratio: completeness_ratio,
+        content_format: content_format,
+        paywall_state: paywall_state
+      }.freeze
+      Result.new(
+        url: url,
+        final_url: final_url,
+        title: payload["title"],
+        byline: payload["byline"],
+        excerpt: payload["excerpt"],
+        site_name: payload["siteName"],
+        published_time: payload["publishedTime"],
+        canonical_url: canonical_url,
+        language: payload["language"],
+        html: payload["html"],
+        markdown: payload["markdown"],
+        metadata: metadata,
+        reader_mode: payload["readerMode"],
+        content_type: content_type,
+        suspect: suspect,
+        warnings: warnings,
+        content_completeness_ratio: completeness_ratio,
+        content_format: content_format,
+        paywall_state: paywall_state
+      )
+    end
+    def resolved_content_type(homepage_like, payload) # Payload type (default "article"); an "article" on a homepage-like URL becomes "list" when it reads like a headline index.
+      content_type = payload["contentType"] || "article"
+      return content_type unless content_type == "article"
+      return "list" if homepage_like && homepage_index_markdown?(payload["title"], payload["markdown"])
+      content_type
+    end
+    def resolved_warnings(content_type, homepage_like, payload, requested_url: nil, final_url: nil) # Payload warnings plus locally detected ones, de-duplicated.
+      warnings = Array(payload["warnings"]).dup
+      warnings << "homepage_index_page" if content_type == "list" && homepage_like
+      warnings << "cross_domain_redirect" if cross_domain_redirect?(requested_url, final_url)
+      warnings << "aggregator_redirect_url" if aggregator_url?(requested_url)
+      warnings.uniq
+    end
+    def homepage_like?(url) # True when the URL path is nil, empty or "/"; false when the URL cannot be parsed.
+      path = URI.parse(url).path
+      path.nil? || path.empty? || path == "/"
+    rescue URI::InvalidURIError
+      false
+    end
+    def homepage_index_markdown?(title, markdown) # Needs a headline phrase in title/markdown AND at least three list-item lines.
+      snippet = [title, markdown].compact.join(" ")
+      return false unless snippet.match?(HOMEPAGE_INDEX_PATTERN)
+      markdown.to_s.lines.grep(/^\s*(?:\d+\.\s+|[-*]\s+)/).count >= 3 # Counts numbered ("1. ") and bulleted ("- " / "* ") lines.
+    end
+    def fallback_result(url, fallback) # fallback is splatted into build_result, so presumably [final_url, payload]; confirm against RawDocsFallback#fetch.
+      build_result(url, *fallback)
+    end
+    def docs_fallback_candidate?(requested_url, result = nil) # True when the requested, final or canonical URL is docs-like per FetchUtil.docs_like_url?.
+      candidates = [requested_url]
+      if result
+        candidates << result.final_url
+        candidates << result.canonical_url
+      end
+      candidates.compact.any? { |candidate| FetchUtil.docs_like_url?(candidate) }
+    end
+    def browser_options(options) # Subset of the constructor options that is forwarded to Browser.new.
+      options.slice(:timeout, :wait, :wait_for_idle, :idle_duration, :viewport,
+                    :user_agent, :accept_language, :browser_path, :browser_options)
+    end
+    def log_request(url, t0) # Appends url and the elapsed monotonic time since t0 to the request log, if one was given.
+      @request_log&.append(url, duration: monotonic_now - t0)
+    end
+    def monotonic_now # Monotonic clock reading, used only for measuring durations.
+      Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    end
+    def poor_docs_result?(result) # Heuristics deciding that an extraction should be replaced by the raw docs fallback.
+      markdown = result.markdown.to_s
+      title = result.title.to_s
+      text_length = FetchUtil.normalize_whitespace(markdown).length
+      return true if result.warnings.include?("not_found_interstitial") || result.warnings.include?("empty_extraction") || result.warnings.include?("short_extraction")
+      return true if markdown.include?("Interstitial: requested page is unavailable")
+      return true if text_length < 160 && title.match?(DOCS_PORTAL_TITLE_PATTERN) # Very short body under a docs-portal style title.
+      return true if title.match?(DOCS_PORTAL_TITLE_PATTERN) && markdown.scan(/^# /).length >= 2 # Two or more top-level headings; presumably an index rather than a single page.
+      false
+    end
+    def effective_domain(url) # Approximates the registrable domain: last two labels, or three for shapes like example.co.uk; nil on URI::InvalidURIError.
+      host = FetchUtil.strip_www_host(url)
+      parts = host.split(".")
+      return host if parts.length <= 2
+      if parts.length >= 3 && parts[-2].match?(SECOND_LEVEL_COUNTRY_TLDS) && parts[-1].length == 2
+        parts.last(3).join(".")
+      else
+        parts.last(2).join(".")
+      end
+    rescue URI::InvalidURIError
+      nil
+    end
+    def cross_domain_redirect?(requested_url, final_url) # True only when both effective domains are known and differ.
+      return false if requested_url.nil? || final_url.nil?
+      req_domain = effective_domain(requested_url)
+      fin_domain = effective_domain(final_url)
+      return false if req_domain.nil? || fin_domain.nil?
+      req_domain != fin_domain
+    end
+    def aggregator_url?(url) # True for news.google.com, cdn.ampproject.org hosts, and google.<tld> "/url" links; false for nil or unparsable URLs.
+      return false if url.nil?
+      host = FetchUtil.strip_www_host(url)
+      path = URI.parse(url).path.to_s
+      return true if host == "news.google.com"
+      return true if host == "cdn.ampproject.org" || host.end_with?(".cdn.ampproject.org")
+      return true if host.match?(GOOGLE_HOST_PATTERN) && path == "/url"
+      false
+    rescue URI::InvalidURIError
+      false
+    end
+    def normalized_result_url(url) # Removes query parameters matching STRIPPED_QUERY_PARAM_PATTERNS; nil, empty and unparsable URLs are returned unchanged.
+      return url if url.nil? || url.empty?
+      uri = URI.parse(url)
+      params = URI.decode_www_form(uri.query.to_s)
+      params.reject! { |key, _value| STRIPPED_QUERY_PARAM_PATTERNS.any? { |pattern| key.match?(pattern) } }
+      uri.query = params.empty? ? nil : URI.encode_www_form(params) # NOTE(review): the query is re-encoded even when nothing was stripped, so kept parameters may be re-serialized (e.g. "?a" may become "?a=") - confirm this is intended.
+      uri.to_s
+    rescue URI::InvalidURIError
+      url
+    end
+  end
+end

data/lib/fetch_util/parallel_fetcher.rb ADDED Viewed

@@ -0,0 +1,97 @@
+# frozen_string_literal: true
+module FetchUtil
+  class ParallelFetcher # Fetches many URLs on worker threads, one fetcher per thread, returning results in input order.
+    Failure = Struct.new(:index, :url, :error, keyword_init: true) # index and url are nil for worker-level failures (see the outer rescue in fetch).
+    class ParallelFetchError < Error # Raised by fetch once all workers have finished if any failure was recorded.
+      attr_reader :failures, :results
+      def initialize(failures, results = nil) # Freezes the given collections in place; results holds whatever was fetched successfully.
+        @failures = failures.freeze
+        @results = results&.freeze
+        super(self.class.build_message(@failures))
+      end
+      def errors # The underlying exception objects, one per failure.
+        @failures.map(&:error)
+      end
+      def self.build_message(failures) # Message previews at most three failures and adds a "+N more" suffix beyond that.
+        preview = failures.first(3).map do |failure|
+          label = failure.url || "<initialization>"
+          "#{label} (#{failure.error.class}: #{failure.error.message})"
+        end.join(", ")
+        suffix = failures.length > 3 ? ", +#{failures.length - 3} more" : ""
+        "parallel fetch failed for #{failures.length} URLs: #{preview}#{suffix}"
+      end
+    end
+    DEFAULT_CONCURRENCY = 4
+    def initialize(fetcher_factory: nil, concurrency: DEFAULT_CONCURRENCY, **fetch_options) # The default factory builds Fetcher.new(**fetch_options); concurrency is coerced with to_i and raised to at least 1.
+      @fetcher_factory = fetcher_factory || -> { Fetcher.new(**fetch_options) }
+      @concurrency = [concurrency.to_i, 1].max
+    end
+    def fetch(urls) # Returns an Array of results; raises ParallelFetchError (with partial results) if any fetch failed.
+      work = Array(urls).compact.map(&:to_s).reject(&:empty?) # NOTE(review): nil/empty entries are dropped, so result indexes refer to this filtered list.
+      return [] if work.empty?
+      jobs = Queue.new
+      failures = Queue.new
+      work.each_with_index { |url, index| jobs << [index, url] }
+      results = Array.new(work.length) # Slots of failed URLs stay nil.
+      worker_count = [@concurrency, work.length].min
+      threads = Array.new(worker_count) do
+        Thread.new do
+          fetcher = @fetcher_factory.call # One fetcher per worker thread.
+          begin
+            loop do
+              begin
+                index, url = jobs.pop(true) # Non-blocking pop; ThreadError signals an empty queue.
+              rescue ThreadError
+                break
+              end
+              begin
+                results[index] = fetcher.fetch(url)
+              rescue StandardError => e
+                failures << Failure.new(index: index, url: url, error: e) # A failed URL does not stop this worker.
+              end
+            end
+          ensure
+            fetcher.quit if fetcher.respond_to?(:quit)
+          end
+        rescue StandardError => e
+          failures << Failure.new(index: nil, url: nil, error: e) # Covers errors raised outside a single fetch, e.g. by the factory call or quit.
+        end
+      end
+      threads.each(&:join)
+      raise_for_failures(drain_queue(failures), results)
+      results
+    end
+    private
+    def drain_queue(queue) # Empties the queue into an Array without blocking.
+      items = []
+      loop do
+        items << queue.pop(true)
+      rescue ThreadError
+        break
+      end
+      items
+    end
+    def raise_for_failures(failures, results) # No-op when failures is empty.
+      return if failures.empty?
+      raise ParallelFetchError.new(failures, results)
+    end
+  end
+end