curlyq 0.0.2

data/bin/curlyq ADDED
@@ -0,0 +1,477 @@
+ #!/usr/bin/env ruby
+ require 'gli'
+ require 'curly'
+ require 'curly/curl'
+
+ include GLI::App
+
+ program_desc 'A scriptable interface to curl'
+
+ version Curly::VERSION
+
+ subcommand_option_handling :normal
+ arguments :strict
+
+ ImageType = Class.new(Symbol)
+ accept ImageType do |value|
+   value.normalize_image_type(:all)
+ end
+
+ BrowserType = Class.new(Symbol)
+ accept BrowserType do |value|
+   value.normalize_browser_type(:none)
+ end
+
+ ScreenshotType = Class.new(Symbol)
+ accept ScreenshotType do |value|
+   value.normalize_screenshot_type(:full_page)
+ end
+
+ desc 'Output YAML instead of JSON'
+ switch %i[y yaml]
+
+ desc 'Output "pretty" JSON'
+ switch %i[pretty], default_value: true, negatable: true
+
+ # TODO: Post method, html and json with --data flags, accept key=value and files
+ # TODO: Handle binary responses, deal gracefully with compressed data
+ # TODO: File uploads?
+
+ def self.break_headers(headers)
+   out = {}
+   headers.each do |h|
+     m = h.match(/(?<key>[^=]+)=(?<value>.*?)$/)
+     out[m['key'].strip] = m['value'].strip
+   end
+   out
+ end
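+
+ # Illustrative example (not in the original source): given header
+ # arguments like ['x-api-key=abc123', 'Accept=application/json'],
+ # break_headers returns
+ #   { 'x-api-key' => 'abc123', 'Accept' => 'application/json' }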
+
+ def self.print_out(output, yaml, raw: false, pretty: true)
+   output = output.to_data if output.respond_to?(:to_data)
+   # Was intended to flatten single responses, but not getting an array back is unpredictable
+   # output = output[0] if output&.is_a?(Array) && output.count == 1
+   if output.is_a?(String)
+     print output
+   elsif raw
+     output = output.join("\n") if output.is_a?(Array)
+     print output
+   else
+     if yaml
+       print YAML.dump(output)
+     else
+       print pretty ? JSON.pretty_generate(output) : JSON.generate(output)
+     end
+   end
+ end
+
+ desc 'Curl URL and output its elements, multiple URLs allowed'
+ arg_name 'URL', multiple: true
+ command %i[html curl] do |c|
+   c.desc 'Only retrieve headers/info'
+   c.switch %i[I info], negatable: false
+
+   c.desc 'Return an array of matches to a CSS or XPath query'
+   c.flag %i[search]
+
+   c.desc 'Define a header to send as "key=value"'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Use a browser to retrieve a dynamic web page (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/
+
+   c.desc %(If curl doesn't work, use a fallback browser (firefox, chrome))
+   c.flag %i[f fallback], type: BrowserType, must_match: /^[fc].*?$/
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed], negatable: false
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Output a raw value for a key'
+   c.flag %i[r raw]
+
+   c.desc 'Ignore relative hrefs when gathering content links'
+   c.switch %i[ignore_relative], negatable: true
+
+   c.desc 'Ignore fragment hrefs when gathering content links'
+   c.switch %i[ignore_fragments], negatable: true
+
+   c.desc 'Only gather external links'
+   c.switch %i[x external_links_only], default_value: false, negatable: false
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, browser: options[:browser], fallback: options[:fallback],
+                            headers: headers, headers_only: options[:info],
+                            compressed: options[:compressed], clean: options[:clean],
+                            ignore_local_links: options[:ignore_relative],
+                            ignore_fragment_links: options[:ignore_fragments],
+                            external_links_only: options[:external_links_only])
+
+       if options[:info]
+         output.push(res.headers)
+         # print_out(res.headers, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+         next
+       end
+
+       if options[:search]
+         out = res.search(options[:search])
+
+         out = out.dot_query(options[:query]) if options[:query]
+         output.push(out)
+       elsif options[:query]
+         queried = res.to_data.dot_query(options[:query])
+         output.push(queried) if queried
+       else
+         output.push(res.to_data(url: url))
+       end
+     end
+
+     output.delete_if(&:empty?)
+     output = output[0] if output.count == 1
+     output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
+
+     print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+   end
+ end
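+
+ # Example invocations (illustrative only, using the flags defined above):
+ #   curlyq html --search 'h1' https://example.com
+ #   curlyq html --info --header 'User-Agent=curlyq' https://example.com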
+
+ desc 'Save a screenshot of a URL'
+ arg_name 'URL', multiple: true
+ command :screenshot do |c|
+   c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
+   c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'full'
+
+   c.desc 'Browser to use (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'
+
+   c.desc 'File destination'
+   c.flag %i[o out file]
+
+   c.action do |_, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     urls.each do |url|
+       page = Curl::Html.new(url)
+       page.screenshot(options[:out], browser: options[:browser], type: options[:type])
+     end
+   end
+ end
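+
+ # Example (illustrative): save a full-page screenshot with Firefox
+ #   curlyq screenshot --browser firefox --type full --out shot.png https://example.com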
+
+ desc 'Get a JSON response from a URL, multiple URLs allowed'
+ arg_name 'URL', multiple: true
+ command :json do |c|
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Json.new(url, headers: headers, compressed: options[:compressed], symbolize_names: false)
+
+       json = res.json
+
+       if json.nil?
+         output.push({
+           status: 'error parsing JSON',
+           url: res.url,
+           code: res.code,
+           headers: res.headers
+         })
+       else
+         json = json.dot_query(options[:query]) if options[:query]
+
+         output.push(json)
+       end
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
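+
+ # Example (illustrative; the dot-syntax path shown is hypothetical):
+ #   curlyq json --query 'data.items' https://api.example.com/endpoint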
+
+ desc 'Extract contents between two regular expressions'
+ arg_name 'URL', multiple: true
+ command :extract do |c|
+   c.desc 'Text before extraction, parsed as regex'
+   c.flag %i[b before]
+
+   c.desc 'Text after extraction, parsed as regex'
+   c.flag %i[a after]
+
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Strip HTML tags from results'
+   c.switch %i[strip]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, headers: headers, headers_only: false,
+                            compressed: options[:compressed], clean: options[:clean])
+       extracted = res.extract(options[:before], options[:after])
+       extracted.strip_tags! if options[:strip]
+       output.concat(extracted)
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
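+
+ # Example (illustrative): pull the text between two regex boundaries
+ #   curlyq extract --before '<h1>' --after '</h1>' --strip https://example.com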
+
+ desc 'Extract all instances of a tag'
+ arg_name 'URL', multiple: true
+ command :tags do |c|
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Specify a tag to collect'
+   c.flag %i[t tag], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'CSS/XPath query'
+   c.flag %i[q query search]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+     tags = options[:tag].join(' ').split(/[, ]+/)
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, headers: headers, headers_only: false,
+                            compressed: options[:compressed], clean: options[:clean])
+       if options[:search]
+         output.concat(res.tags.search(options[:search]))
+       elsif tags.count.positive?
+         tags.each { |tag| output.concat(res.tags(tag)) }
+       else
+         output.concat(res.tags)
+       end
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
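+
+ # Example (illustrative): collect every h2 and h3 tag from a page
+ #   curlyq tags --tag h2,h3 https://example.com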
+
+ desc 'Extract all images from a URL'
+ arg_name 'URL', multiple: true
+ command :images do |c|
+   c.desc 'Type of images to return (img, srcset, opengraph, all)'
+   c.flag %i[t type], multiple: true, type: ImageType, default_value: ['all']
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     types = options[:type].join(' ').split(/[ ,]+/).map(&:normalize_image_type)
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean])
+       output.concat(res.images(types: types))
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
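+
+ # Example (illustrative): return only srcset and OpenGraph images
+ #   curlyq images --type srcset,opengraph https://example.com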
+
+ desc %(Return all links on a URL's page)
+ arg_name 'URL', multiple: true
+ command :links do |c|
+   c.desc 'Ignore relative hrefs when gathering content links'
+   c.switch %i[ignore_relative], negatable: true
+
+   c.desc 'Ignore fragment hrefs when gathering content links'
+   c.switch %i[ignore_fragments], negatable: true
+
+   c.desc 'Only gather external links'
+   c.switch %i[x external_links_only], default_value: false, negatable: false
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Filter out duplicate links, keeping only the first occurrence'
+   c.switch %i[d dedup], negatable: true
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean],
+                            ignore_local_links: options[:ignore_relative],
+                            ignore_fragment_links: options[:ignore_fragments],
+                            external_links_only: options[:external_links_only])
+
+       if options[:query]
+         query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
+         queried = { links: res.to_data[:links] }.dot_query(query)
+         output.concat(queried) if queried
+       else
+         output.concat(res.body_links)
+       end
+     end
+
+     output.dedup_links! if options[:dedup]
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
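+
+ # Example (illustrative): external links only, with duplicates removed
+ #   curlyq links --external_links_only --dedup https://example.com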
+
+ desc %(Return all <head> links on a URL's page)
+ arg_name 'URL', multiple: true
+ command :headlinks do |c|
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean])
+
+       if options[:query]
+         query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
+         queried = { links: res.to_data[:meta_links] }.dot_query(query)
+         output.concat(queried) if queried
+       else
+         output.push(res.to_data[:meta_links])
+       end
+     end
+
+     output = output[0] if output.count == 1
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
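+
+ # Example (illustrative): list the <head> link elements of a page
+ #   curlyq headlinks https://example.com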
390
+
391
+ desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.)
392
+ arg_name 'URL', multiple: true
393
+ command :scrape do |c|
394
+ c.desc 'Browser to use (firefox, chrome)'
395
+ c.flag %i[b browser], type: BrowserType
396
+
397
+ c.desc 'Regurn an array of matches to a CSS or XPath query'
398
+ c.flag %i[search]
399
+
400
+ c.desc 'Define a header to send as "key=value"'
401
+ c.flag %i[h header], multiple: true
402
+
403
+ c.desc 'Remove extra whitespace from results'
404
+ c.switch %i[clean]
405
+
406
+ c.desc 'Filter output using dot-syntax path'
407
+ c.flag %i[q query filter]
408
+
409
+ c.desc 'Output a raw value for a key'
410
+ c.flag %i[r raw]
411
+
412
+ c.action do |global_options, options, args|
413
+ urls = args.join(' ').split(/[, ]+/)
414
+
415
+ output = []
416
+
417
+ urls.each do |url|
418
+ driver = Selenium::WebDriver.for options[:browser]
419
+ begin
420
+ driver.get url
421
+ res = driver.page_source
422
+
423
+ res = Curl::Html.new(nil, source: res, clean: options[:clean])
424
+ if options[:search]
425
+ out = res.search(options[:search])
426
+
427
+ out = out.dot_query(options[:query]) if options[:query]
428
+ output.push(out)
429
+ elsif options[:query]
430
+ queried = res.to_data(url: url).dot_query(options[:query])
431
+ output = queried if queried
432
+ else
433
+ output.push(res.to_data(url: url))
434
+ end
435
+
436
+ # elements = driver.find_elements(css: options[:query])
437
+
438
+ # elements.each { |e| output.push(e.text.strip) }
439
+ ensure
440
+ driver.quit
441
+ end
442
+ end
443
+
444
+ output.delete_if(&:empty?)
445
+
446
+ output = output[0] if output.count == 1
447
+
448
+ if options[:raw]
449
+ output.map! { |o| o[options[:raw].to_sym] }
450
+ end
451
+
452
+ print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
453
+ end
454
+ end
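+
+ # Example (illustrative): render a JS-driven page in Chrome and search it
+ #   curlyq scrape --browser chrome --search 'article a' https://example.com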
+
+ pre do |global, command, options, args|
+   # Pre logic here
+   # Return true to proceed; false to abort and not call the
+   # chosen command
+   # Use skips_pre before a command to skip this block
+   # on that command only
+   true
+ end
+
+ post do |global, command, options, args|
+   # Post logic here
+   # Use skips_post before a command to skip this
+   # block on that command only
+ end
+
+ on_error do |exception|
+   # Error logic here
+   # return false to skip default error handling
+   true
+ end
+
+ exit run(ARGV)
data/curlyq.gemspec ADDED
@@ -0,0 +1,27 @@
+ # Ensure we require the local version and not one we might have installed already
+ require File.join([File.dirname(__FILE__), 'lib', 'curly', 'version.rb'])
+ spec = Gem::Specification.new do |s|
+   s.name = 'curlyq'
+   s.version = Curly::VERSION
+   s.author = 'Brett Terpstra'
+   s.email = 'me@brettterpstra.com'
+   s.homepage = 'https://brettterpstra.com'
+   s.platform = Gem::Platform::RUBY
+   s.licenses = 'MIT'
+   s.summary = 'A CLI helper for curl and web scraping'
+   s.files = `git ls-files`.split("\n")
+   s.require_paths << 'lib'
+   s.extra_rdoc_files = ['README.rdoc', 'curlyq.rdoc']
+   s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
+   s.bindir = 'bin'
+   s.executables << 'curlyq'
+   s.add_development_dependency('rake', '~> 0.9.2')
+   s.add_development_dependency('rdoc', '~> 4.3')
+   s.add_development_dependency('minitest', '~> 5.14')
+   s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
+   s.add_runtime_dependency('gli', '~> 2.21.0')
+   s.add_runtime_dependency('tty-which', '~> 0.5.0')
+   s.add_runtime_dependency('nokogiri', '~> 1.16.0')
+   s.add_runtime_dependency('selenium-webdriver', '~> 4.16.0')
+ end