RubyGems - botz - Versions diffs - 0.3.0 → 0.4.0 - Mend

botz 0.3.0 → 0.4.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/bin/console +5 -3
data/exe/botz +1 -0
data/lib/botz.rb +3 -3
data/lib/botz/console.rb +0 -12
data/lib/botz/definition.rb +12 -24
data/lib/botz/definition_file.rb +1 -1
data/lib/botz/resource_accessor.rb +10 -0
data/lib/botz/resource_accessor/direct_html.rb +14 -0
data/lib/botz/resource_accessor/html.rb +34 -0
data/lib/botz/scraper.rb +10 -0
data/lib/botz/scraper/direct_html.rb +64 -0
data/lib/botz/{html_scraper_macro.rb → scraper/html.rb} +1 -2
data/lib/botz/shell.rb +30 -3
data/lib/botz/version.rb +1 -1
metadata +8 -4
data/lib/botz/downloader.rb +0 -43

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9fc66e77b306d2024ecaf98a7de9cb68a4489a42d0ccd7435245b4d6a385161b
-  data.tar.gz: f42f5b708de3b5e52ea29947a5d08cc31bcb0dadd89dcaa69b71de9b5ad001a9
+  metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
+  data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
 SHA512:
-  metadata.gz: 4e8c1e31edde612bcf993fd470fc4249bfd236ffd2e3640cca22f1098d22a5c7b0bfa9ade04f0263c94df5faa9901b4743dcc5cc7f343512c9b6e079711a78f4
-  data.tar.gz: 52e4dbb1fe4ac0f9b07654d012a51e3a17b86a8279fe99eb34bb445fbc9845589b8adce472a92f636609c938ef486a9b86f81f31a5831f7315afdabafca5a492
+  metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
+  data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    botz (0.2.0)
+    botz (0.4.0)
       activemodel (~> 5.2)
       activesupport (~> 5.2)
       mechanize

data/bin/console CHANGED Viewed

@@ -14,7 +14,9 @@ def reload!
   ActiveSupport::DescendantsTracker.clear
   ActiveSupport::Reloader.reload!
 end
-Pry.start
-# require 'irb'
-# IRB.start(__FILE__)
+if ARGV[0]
+  Botz.open(ARGV[0]).console
+else
+  Pry.start
+end

data/exe/botz CHANGED Viewed

@@ -7,6 +7,7 @@ case ARGV[0]&.to_sym
 when :spider then Botz.open(ARGV[1]).shell.spider(ARGV[2])
 when :scraper then Botz.open(ARGV[1]).shell.scraper(ARGV[2])
 when :shell then Botz.open(ARGV[1]).shell.function
+when :new then Botz.open(ARGV[1]).shell.build
 when :console
   if ARGV[1].blank?
     Botz.console

data/lib/botz.rb CHANGED Viewed

@@ -15,8 +15,8 @@ module Botz
   autoload :Definition
   autoload :DefinitionFile
   autoload :Spider
-  autoload :Downloader
-  autoload :HtmlScraperMacro
+  autoload :Scraper
+  autoload :ResourceAccessor
   const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
@@ -29,7 +29,7 @@ module Botz
     ::Botz::DefinitionFile.open(filepath)
   end
-  def self.define(name, domain:, &block)
+  def self.define(name, domain: nil, &block)
     crawler_definition = Class.new(::Botz::Definition, &block)
     crawler_definition.domain = domain
     crawler_class_name = name.to_s.camelize

data/lib/botz/console.rb CHANGED Viewed

@@ -18,16 +18,4 @@ class Botz::Console
   def reload!
     @definition_file&.eval_definition
   end
-  def scraper(name, url, &block)
-    scrapers[name.to_sym].call(url, &block)
-  end
-  def spider(name, url = nil, &block)
-    if url
-      spiders[name.to_sym].call(url, &block)
-    else
-      spiders[name.to_sym].call(&block)
-    end
-  end
 end

data/lib/botz/definition.rb CHANGED Viewed

@@ -7,27 +7,26 @@ class Botz::Definition
   class_attribute :domain
   class_attribute :spiders, default: {}
   class_attribute :scrapers, default: {}
-  Output = ->(result) { STDOUT.puts(result.to_json) }
+  class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
   def output(&block)
-    remove_const(:Output)
-    const_set(:Output, block)
+    self.output = block
   end
   # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
   class << self
-    def scraper(name, as: :html, &block)
+    def scraper(name, encoding: nil, as: :html, &block)
       class_name = "#{name}_scraper".classify
-      downloader = Botz::Downloader.new(as)
-      binder_base = Botz.const_get "#{as}_scraper_macro".classify
+      accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+      accessor = accessor_class.new(encoding: encoding)
+      binder_base = Botz::Scraper.const_get(as.to_s.classify)
       binder = Class.new(binder_base, &block)
       binder.define_singleton_method(:name) { class_name }
       crawler_class = self
       scraper_class = Class.new do
         define_singleton_method(:crawler_class) { crawler_class }
         define_singleton_method(:bind) do |url|
-          downloader.call(url) do |resource|
+          accessor.call(url) do |resource|
             binder.new(scraper_class, resource)
           end
         end
@@ -37,12 +36,14 @@ class Botz::Definition
       scrapers[name] = scraper_class
     end
-    def spider(name, start_url = nil, as: :html, &block)
-      downloader = Botz::Downloader.new(as)
+    def spider(name, start_url = nil, encoding: nil, as: :html, &block)
+      accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+      accessor = accessor_class.new(start_url: start_url, encoding: encoding)
       spider = Botz::Spider.new(&block)
       spider_class = Class.new do
+        define_singleton_method(:accessor) { accessor }
         define_singleton_method(:call) do |url = start_url, &spider_block|
-          downloader.call(url) do |resource|
+          accessor.call(url) do |resource|
             spider.call(resource, &spider_block)
           end
         end
@@ -50,19 +51,6 @@ class Botz::Definition
       const_set("#{name}_spider".classify, spider_class)
       spiders[name] = spider_class
     end
-    def before_context(url:)
-      downloader = Botz::Downloader.new(:html)
-      before_context_class = Class.new do
-        define_singleton_method(:call) do
-          downloader.call(url) do |page|
-            yield(page)
-            page.mech
-          end
-        end
-      end
-      const_set('before_context'.classify, before_context_class)
-    end
   end
   # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
 end

data/lib/botz/definition_file.rb CHANGED Viewed

@@ -6,7 +6,7 @@
 class Botz::DefinitionFile
   attr_reader :path
   attr_reader :definition
-  delegate :spiders, :scrapers, :Output, to: :definition
+  delegate :spiders, :scrapers, :output, to: :definition
   def self.open(filepath)
     object = new(filepath)

data/lib/botz/resource_accessor.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+#
+# This class is responsible for actually making a network connection and downloading hypertext
+#
+module Botz::ResourceAccessor
+  extend ActiveSupport::Autoload
+  autoload :Html
+  autoload :DirectHtml
+end

data/lib/botz/resource_accessor/direct_html.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+#
+# Nokogiri wrapper
+#
+class Botz::ResourceAccessor::DirectHtml
+  def initialize(encoding: nil)
+    @encoding = encoding
+  end
+  def call(html)
+    yield Nokogiri::HTML.parse(html)
+  end
+end

data/lib/botz/resource_accessor/html.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+#
+# Mechanize wrapper
+#
+class Botz::ResourceAccessor::Html
+  USER_AGENT = [
+    'Mozilla/5.0',
+    '(Macintosh; Intel Mac OS X 10_12_6)',
+    'AppleWebKit/537.36',
+    '(KHTML, like Gecko)',
+    'Chrome/64.0.3282.186',
+    'Safari/537.36'
+  ].join(' ')
+  attr_reader :start_url
+  attr_reader :agent
+  def initialize(start_url: nil, encoding: nil)
+    @start_url = start_url
+    @agent = Mechanize.new
+    if encoding
+      @agent.default_encoding = encoding
+      @agent.force_default_encoding = true
+    end
+    @agent.user_agent = USER_AGENT
+  end
+  def call(url = @start_url, &block)
+    fail 'URL is undefined' if url.blank?
+    agent.get(url, &block)
+  end
+end

data/lib/botz/scraper.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+#
+# scraper namespace
+#
+module Botz::Scraper
+  extend ActiveSupport::Autoload
+  autoload :Html
+  autoload :DirectHtml
+end

data/lib/botz/scraper/direct_html.rb ADDED Viewed

@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+#
+# direct resource to html scraping
+#
+class Botz::Scraper::DirectHtml
+  include ActiveModel::Model
+  include ActiveModel::Attributes
+  #
+  # Scraper error class
+  #
+  class Error < StandardError
+    def initialize(scraper_class, errors)
+      super("#{scraper_class} # #{errors.full_messages}")
+    end
+  end
+  attr_reader :scraper_class
+  attr_reader :html
+  def initialize(scraper_class, resource)
+    @scraper_class = scraper_class
+    @html = resource
+  end
+  class << self
+    def field_names
+      @field_names ||= []
+    end
+  end
+  def to_h
+    fetched_at = Time.current
+    fetched_on = fetched_at.beginning_of_day
+    timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
+    self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
+  end
+  def call
+    fail Error.new(scraper_class, errors) if invalid?
+    yield(to_h)
+  end
+  # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+  def self.field(name, path = nil, persist: true, &block)
+    if persist
+      field_names << name
+      case name
+      when /.*\?/
+        validates name, inclusion: { in: [true, false] }
+      else
+        validates name, presence: true, allow_blank: true
+      end
+    end
+    return define_method(name) { instance_exec(html, &block) } if path.nil?
+    return define_method(name) { html.search(path).text.strip } if block.nil?
+    define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
+  end
+  # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+end

data/lib/botz/{html_scraper_macro.rb → scraper/html.rb} RENAMED Viewed

@@ -3,7 +3,7 @@
 #
 # DSL for parsing html into objects
 #
-class Botz::HtmlScraperMacro
+class Botz::Scraper::Html
   include ActiveModel::Model
   include ActiveModel::Attributes
@@ -24,7 +24,6 @@ class Botz::HtmlScraperMacro
     @scraper_class = scraper_class
     @url = resource.uri
     @html = resource
-    @writer = writer
   end
   class << self

data/lib/botz/shell.rb CHANGED Viewed

@@ -17,9 +17,9 @@ class Botz::Shell
     while line = STDIN.gets
       url = line.strip
       begin
-        command.call(url, &definition_file.Output)
-      rescue
-        STDERR.puts "ERROR #{command} #{url}"
+        command.call(url, &definition_file.output)
+      rescue => e
+        STDERR.puts "ERROR #{e}"
       end
     end
   end
@@ -47,4 +47,31 @@ class Botz::Shell
       }
     SHELL
   end
+  # rubocop:disable Metrics/MethodLength
+  def build(name)
+    File.open("#{name}.rb", 'w') do |f|
+      f.write <<~RUBY
+        # frozen_string_literal: true
+        Botz.define(:#{name}) do
+          spider(:example, 'http://example.com') do |html, yielder|
+            #  yielder.call(url or resource)
+          end
+          scraper(:example) do
+          end
+        end
+      RUBY
+    end
+    File.open("#{name}.sh", 'w') do |f|
+      f.write <<~SHELL
+        #!/bin/bash
+        eval "$(botz $(dirname "${0}")/#{name}.rb shell)"
+        spider example
+      SHELL
+    end
+  end
+  # rubocop:enable Metrics/MethodLength
 end

data/lib/botz/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Botz
-  VERSION = '0.3.0'
+  VERSION = '0.4.0'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: botz
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - aileron
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-06-29 00:00:00.000000000 Z
+date: 2019-07-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -151,8 +151,12 @@ files:
 - lib/botz/console.rb
 - lib/botz/definition.rb
 - lib/botz/definition_file.rb
-- lib/botz/downloader.rb
-- lib/botz/html_scraper_macro.rb
+- lib/botz/resource_accessor.rb
+- lib/botz/resource_accessor/direct_html.rb
+- lib/botz/resource_accessor/html.rb
+- lib/botz/scraper.rb
+- lib/botz/scraper/direct_html.rb
+- lib/botz/scraper/html.rb
 - lib/botz/shell.rb
 - lib/botz/spider.rb
 - lib/botz/version.rb

data/lib/botz/downloader.rb DELETED Viewed

@@ -1,43 +0,0 @@
-# frozen_string_literal: true
-#
-# This class is responsible for actually making a network connection and downloading hypertext
-#
-class Botz::Downloader
-  include ActiveModel::Model
-  include ActiveModel::Attributes
-  USER_AGENT = [
-    'Mozilla/5.0',
-    '(Macintosh; Intel Mac OS X 10_12_6)',
-    'AppleWebKit/537.36',
-    '(KHTML, like Gecko)',
-    'Chrome/64.0.3282.186',
-    'Safari/537.36'
-  ].join(' ')
-  html_loader = lambda do |ctx, url, block|
-    block_result = nil
-    agent = Mechanize.new
-    agent.user_agent = USER_AGENT
-    ctx&.call(agent)
-    agent.get(url) { |page| block_result = block.call(page) }
-    block_result
-  end
-  json_loader = lambda do |_ctx, url, block|
-    block.call JSON.parse(OpenURI.open_uri(url, 'User-Agent' => USER_AGENT))
-  end
-  class_attribute :loaders, default: { html: html_loader, json: json_loader }
-  attribute :context
-  attribute :loader
-  def initialize(name, context = nil)
-    super(loader: loaders[name], context: context)
-  end
-  def call(url, &block)
-    loader.call(context, url, block)
-  end
-end