RubyGems - botz - Versions diffs - 0.3.0 → 0.4.0 - Mend

botz 0.3.0 → 0.4.0

Files changed (18) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/bin/console +5 -3
data/exe/botz +1 -0
data/lib/botz.rb +3 -3
data/lib/botz/console.rb +0 -12
data/lib/botz/definition.rb +12 -24
data/lib/botz/definition_file.rb +1 -1
data/lib/botz/resource_accessor.rb +10 -0
data/lib/botz/resource_accessor/direct_html.rb +14 -0
data/lib/botz/resource_accessor/html.rb +34 -0
data/lib/botz/scraper.rb +10 -0
data/lib/botz/scraper/direct_html.rb +64 -0
data/lib/botz/{html_scraper_macro.rb → scraper/html.rb} +1 -2
data/lib/botz/shell.rb +30 -3
data/lib/botz/version.rb +1 -1
metadata +8 -4
data/lib/botz/downloader.rb +0 -43

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9fc66e77b306d2024ecaf98a7de9cb68a4489a42d0ccd7435245b4d6a385161b
-  data.tar.gz: f42f5b708de3b5e52ea29947a5d08cc31bcb0dadd89dcaa69b71de9b5ad001a9
+  metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
+  data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
 SHA512:
-  metadata.gz: 4e8c1e31edde612bcf993fd470fc4249bfd236ffd2e3640cca22f1098d22a5c7b0bfa9ade04f0263c94df5faa9901b4743dcc5cc7f343512c9b6e079711a78f4
-  data.tar.gz: 52e4dbb1fe4ac0f9b07654d012a51e3a17b86a8279fe99eb34bb445fbc9845589b8adce472a92f636609c938ef486a9b86f81f31a5831f7315afdabafca5a492
+  metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
+  data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    botz (0.2.0)
+    botz (0.4.0)
       activemodel (~> 5.2)
       activesupport (~> 5.2)
       mechanize

data/bin/console CHANGED Viewed

@@ -14,7 +14,9 @@ def reload!
   ActiveSupport::DescendantsTracker.clear
   ActiveSupport::Reloader.reload!
 end
-Pry.start
-# require 'irb'
-# IRB.start(__FILE__)
+if ARGV[0]
+  Botz.open(ARGV[0]).console
+else
+  Pry.start
+end

data/exe/botz CHANGED Viewed

@@ -7,6 +7,7 @@ case ARGV[0]&.to_sym
 when :spider then Botz.open(ARGV[1]).shell.spider(ARGV[2])
 when :scraper then Botz.open(ARGV[1]).shell.scraper(ARGV[2])
 when :shell then Botz.open(ARGV[1]).shell.function
+when :new then Botz.open(ARGV[1]).shell.build
 when :console
   if ARGV[1].blank?
     Botz.console

data/lib/botz.rb CHANGED Viewed

@@ -15,8 +15,8 @@ module Botz
   autoload :Definition
   autoload :DefinitionFile
   autoload :Spider
-  autoload :Downloader
-  autoload :HtmlScraperMacro
+  autoload :Scraper
+  autoload :ResourceAccessor
   const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
@@ -29,7 +29,7 @@ module Botz
     ::Botz::DefinitionFile.open(filepath)
   end
-  def self.define(name, domain:, &block)
+  def self.define(name, domain: nil, &block)
     crawler_definition = Class.new(::Botz::Definition, &block)
     crawler_definition.domain = domain
     crawler_class_name = name.to_s.camelize

data/lib/botz/console.rb CHANGED Viewed

@@ -18,16 +18,4 @@ class Botz::Console
   def reload!
     @definition_file&.eval_definition
   end
-  def scraper(name, url, &block)
-    scrapers[name.to_sym].call(url, &block)
-  end
-  def spider(name, url = nil, &block)
-    if url
-      spiders[name.to_sym].call(url, &block)
-    else
-      spiders[name.to_sym].call(&block)
-    end
-  end
 end

data/lib/botz/definition.rb CHANGED Viewed

@@ -7,27 +7,26 @@ class Botz::Definition
   class_attribute :domain
   class_attribute :spiders, default: {}
   class_attribute :scrapers, default: {}
-  Output = ->(result) { STDOUT.puts(result.to_json) }
+  class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
   def output(&block)
-    remove_const(:Output)
-    const_set(:Output, block)
+    self.output = block
   end
   # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
   class << self
-    def scraper(name, as: :html, &block)
+    def scraper(name, encoding: nil, as: :html, &block)
       class_name = "#{name}_scraper".classify
-      downloader = Botz::Downloader.new(as)
-      binder_base = Botz.const_get "#{as}_scraper_macro".classify
+      accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+      accessor = accessor_class.new(encoding: encoding)
+      binder_base = Botz::Scraper.const_get(as.to_s.classify)
       binder = Class.new(binder_base, &block)
       binder.define_singleton_method(:name) { class_name }
       crawler_class = self
       scraper_class = Class.new do
         define_singleton_method(:crawler_class) { crawler_class }
         define_singleton_method(:bind) do |url|
-          downloader.call(url) do |resource|
+          accessor.call(url) do |resource|
             binder.new(scraper_class, resource)
           end
         end
@@ -37,12 +36,14 @@ class Botz::Definition
       scrapers[name] = scraper_class
     end
-    def spider(name, start_url = nil, as: :html, &block)
-      downloader = Botz::Downloader.new(as)
+    def spider(name, start_url = nil, encoding: nil, as: :html, &block)
+      accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+      accessor = accessor_class.new(start_url: start_url, encoding: encoding)
       spider = Botz::Spider.new(&block)
       spider_class = Class.new do
+        define_singleton_method(:accessor) { accessor }
         define_singleton_method(:call) do |url = start_url, &spider_block|
-          downloader.call(url) do |resource|
+          accessor.call(url) do |resource|
             spider.call(resource, &spider_block)
           end
         end
@@ -50,19 +51,6 @@ class Botz::Definition
       const_set("#{name}_spider".classify, spider_class)
       spiders[name] = spider_class
     end
-    def before_context(url:)
-      downloader = Botz::Downloader.new(:html)
-      before_context_class = Class.new do
-        define_singleton_method(:call) do
-          downloader.call(url) do |page|
-            yield(page)
-            page.mech
-          end
-        end
-      end
-      const_set('before_context'.classify, before_context_class)
-    end
   end
   # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
 end

data/lib/botz/definition_file.rb CHANGED Viewed

@@ -6,7 +6,7 @@
 class Botz::DefinitionFile
   attr_reader :path
   attr_reader :definition
-  delegate :spiders, :scrapers, :Output, to: :definition
+  delegate :spiders, :scrapers, :output, to: :definition
   def self.open(filepath)
     object = new(filepath)

data/lib/botz/resource_accessor.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+#
+# This class is responsible for actually making a network connection and downloading hypertext
+#
+module Botz::ResourceAccessor
+  extend ActiveSupport::Autoload
+  autoload :Html
+  autoload :DirectHtml
+end

data/lib/botz/resource_accessor/direct_html.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+#
+# Nokogiri wrapper
+#
+class Botz::ResourceAccessor::DirectHtml
+  def initialize(encoding: nil)
+    @encoding = encoding
+  end
+  def call(html)
+    yield Nokogiri::HTML.parse(html)
+  end
+end

data/lib/botz/resource_accessor/html.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+#
+# Mechanize wrapper
+#
+class Botz::ResourceAccessor::Html
+  USER_AGENT = [
+    'Mozilla/5.0',
+    '(Macintosh; Intel Mac OS X 10_12_6)',
+    'AppleWebKit/537.36',
+    '(KHTML, like Gecko)',
+    'Chrome/64.0.3282.186',
+    'Safari/537.36'
+  ].join(' ')
+  attr_reader :start_url
+  attr_reader :agent
+  def initialize(start_url: nil, encoding: nil)
+    @start_url = start_url
+    @agent = Mechanize.new
+    if encoding
+      @agent.default_encoding = encoding
+      @agent.force_default_encoding = true
+    end
+    @agent.user_agent = USER_AGENT
+  end
+  def call(url = @start_url, &block)
+    fail 'URL is undefined' if url.blank?
+    agent.get(url, &block)
+  end
+end

data/lib/botz/scraper.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+#
+# scraper namespace
+#
+module Botz::Scraper
+  extend ActiveSupport::Autoload
+  autoload :Html
+  autoload :DirectHtml
+end

data/lib/botz/scraper/direct_html.rb ADDED Viewed

@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+#
+# direct resource to html scraping
+#
+class Botz::Scraper::DirectHtml
+  include ActiveModel::Model
+  include ActiveModel::Attributes
+  #
+  # Scraper error class
+  #
+  class Error < StandardError
+    def initialize(scraper_class, errors)
+      super("#{scraper_class} # #{errors.full_messages}")
+    end
+  end
+  attr_reader :scraper_class
+  attr_reader :html
+  def initialize(scraper_class, resource)
+    @scraper_class = scraper_class
+    @html = resource
+  end
+  class << self
+    def field_names
+      @field_names ||= []
+    end
+  end
+  def to_h
+    fetched_at = Time.current
+    fetched_on = fetched_at.beginning_of_day
+    timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
+    self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
+  end
+  def call
+    fail Error.new(scraper_class, errors) if invalid?
+    yield(to_h)
+  end
+  # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+  def self.field(name, path = nil, persist: true, &block)
+    if persist
+      field_names << name
+      case name
+      when /.*\?/
+        validates name, inclusion: { in: [true, false] }
+      else
+        validates name, presence: true, allow_blank: true
+      end
+    end
+    return define_method(name) { instance_exec(html, &block) } if path.nil?
+    return define_method(name) { html.search(path).text.strip } if block.nil?
+    define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
+  end
+  # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+end

data/lib/botz/{html_scraper_macro.rb → scraper/html.rb} RENAMED Viewed

@@ -3,7 +3,7 @@
 #
 # DSL for parsing html into objects
 #
-class Botz::HtmlScraperMacro
+class Botz::Scraper::Html
   include ActiveModel::Model
   include ActiveModel::Attributes
@@ -24,7 +24,6 @@ class Botz::HtmlScraperMacro
     @scraper_class = scraper_class
     @url = resource.uri
     @html = resource
-    @writer = writer
   end
   class << self

data/lib/botz/shell.rb CHANGED Viewed

@@ -17,9 +17,9 @@ class Botz::Shell
     while line = STDIN.gets
       url = line.strip
       begin
-        command.call(url, &definition_file.Output)
-      rescue
-        STDERR.puts "ERROR #{command} #{url}"
+        command.call(url, &definition_file.output)
+      rescue => e
+        STDERR.puts "ERROR #{e}"
       end
     end
   end
@@ -47,4 +47,31 @@ class Botz::Shell
       }
     SHELL
   end
+  # rubocop:disable Metrics/MethodLength
+  def build(name)
+    File.open("#{name}.rb", 'w') do |f|
+      f.write <<~RUBY
+        # frozen_string_literal: true
+        Botz.define(:#{name}) do
+          spider(:example, 'http://example.com') do |html, yielder|
+            #  yielder.call(url or resource)
+          end
+          scraper(:example) do
+          end
+        end
+      RUBY
+    end
+    File.open("#{name}.sh", 'w') do |f|
+      f.write <<~SHELL
+        #!/bin/bash
+        eval "$(botz $(dirname "${0}")/#{name}.rb shell)"
+        spider example
+      SHELL
+    end
+  end
+  # rubocop:enable Metrics/MethodLength
 end

data/lib/botz/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Botz
-  VERSION = '0.3.0'
+  VERSION = '0.4.0'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: botz
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - aileron
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-06-29 00:00:00.000000000 Z
+date: 2019-07-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -151,8 +151,12 @@ files:
 - lib/botz/console.rb
 - lib/botz/definition.rb
 - lib/botz/definition_file.rb
-- lib/botz/downloader.rb
-- lib/botz/html_scraper_macro.rb
+- lib/botz/resource_accessor.rb
+- lib/botz/resource_accessor/direct_html.rb
+- lib/botz/resource_accessor/html.rb
+- lib/botz/scraper.rb
+- lib/botz/scraper/direct_html.rb
+- lib/botz/scraper/html.rb
 - lib/botz/shell.rb
 - lib/botz/spider.rb
 - lib/botz/version.rb

data/lib/botz/downloader.rb DELETED Viewed

@@ -1,43 +0,0 @@
-# frozen_string_literal: true
-#
-# This class is responsible for actually making a network connection and downloading hypertext
-#
-class Botz::Downloader
-  include ActiveModel::Model
-  include ActiveModel::Attributes
-  USER_AGENT = [
-    'Mozilla/5.0',
-    '(Macintosh; Intel Mac OS X 10_12_6)',
-    'AppleWebKit/537.36',
-    '(KHTML, like Gecko)',
-    'Chrome/64.0.3282.186',
-    'Safari/537.36'
-  ].join(' ')
-  html_loader = lambda do |ctx, url, block|
-    block_result = nil
-    agent = Mechanize.new
-    agent.user_agent = USER_AGENT
-    ctx&.call(agent)
-    agent.get(url) { |page| block_result = block.call(page) }
-    block_result
-  end
-  json_loader = lambda do |_ctx, url, block|
-    block.call JSON.parse(OpenURI.open_uri(url, 'User-Agent' => USER_AGENT))
-  end
-  class_attribute :loaders, default: { html: html_loader, json: json_loader }
-  attribute :context
-  attribute :loader
-  def initialize(name, context = nil)
-    super(loader: loaders[name], context: context)
-  end
-  def call(url, &block)
-    loader.call(context, url, block)
-  end
-end