botz 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 9fc66e77b306d2024ecaf98a7de9cb68a4489a42d0ccd7435245b4d6a385161b
-   data.tar.gz: f42f5b708de3b5e52ea29947a5d08cc31bcb0dadd89dcaa69b71de9b5ad001a9
+   metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
+   data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
  SHA512:
-   metadata.gz: 4e8c1e31edde612bcf993fd470fc4249bfd236ffd2e3640cca22f1098d22a5c7b0bfa9ade04f0263c94df5faa9901b4743dcc5cc7f343512c9b6e079711a78f4
-   data.tar.gz: 52e4dbb1fe4ac0f9b07654d012a51e3a17b86a8279fe99eb34bb445fbc9845589b8adce472a92f636609c938ef486a9b86f81f31a5831f7315afdabafca5a492
+   metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
+   data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     botz (0.2.0)
+     botz (0.4.0)
        activemodel (~> 5.2)
        activesupport (~> 5.2)
        mechanize
data/bin/console CHANGED
@@ -14,7 +14,9 @@ def reload!
    ActiveSupport::DescendantsTracker.clear
    ActiveSupport::Reloader.reload!
  end
- Pry.start

- # require 'irb'
- # IRB.start(__FILE__)
+ if ARGV[0]
+   Botz.open(ARGV[0]).console
+ else
+   Pry.start
+ end
data/exe/botz CHANGED
@@ -7,6 +7,7 @@ case ARGV[0]&.to_sym
  when :spider then Botz.open(ARGV[1]).shell.spider(ARGV[2])
  when :scraper then Botz.open(ARGV[1]).shell.scraper(ARGV[2])
  when :shell then Botz.open(ARGV[1]).shell.function
+ when :new then Botz.open(ARGV[1]).shell.build
  when :console
    if ARGV[1].blank?
      Botz.console
data/lib/botz.rb CHANGED
@@ -15,8 +15,8 @@ module Botz
    autoload :Definition
    autoload :DefinitionFile
    autoload :Spider
-   autoload :Downloader
-   autoload :HtmlScraperMacro
+   autoload :Scraper
+   autoload :ResourceAccessor

    const_set(:Crawler, Module.new) unless const_defined?(:Crawler)

@@ -29,7 +29,7 @@ module Botz
      ::Botz::DefinitionFile.open(filepath)
    end

-   def self.define(name, domain:, &block)
+   def self.define(name, domain: nil, &block)
      crawler_definition = Class.new(::Botz::Definition, &block)
      crawler_definition.domain = domain
      crawler_class_name = name.to_s.camelize
data/lib/botz/console.rb CHANGED
@@ -18,16 +18,4 @@ class Botz::Console
    def reload!
      @definition_file&.eval_definition
    end
-
-   def scraper(name, url, &block)
-     scrapers[name.to_sym].call(url, &block)
-   end
-
-   def spider(name, url = nil, &block)
-     if url
-       spiders[name.to_sym].call(url, &block)
-     else
-       spiders[name.to_sym].call(&block)
-     end
-   end
  end
data/lib/botz/definition.rb CHANGED
@@ -7,27 +7,26 @@ class Botz::Definition
    class_attribute :domain
    class_attribute :spiders, default: {}
    class_attribute :scrapers, default: {}
-
-   Output = ->(result) { STDOUT.puts(result.to_json) }
+   class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }

    def output(&block)
-     remove_const(:Output)
-     const_set(:Output, block)
+     self.output = block
    end

    # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
    class << self
-     def scraper(name, as: :html, &block)
+     def scraper(name, encoding: nil, as: :html, &block)
        class_name = "#{name}_scraper".classify
-       downloader = Botz::Downloader.new(as)
-       binder_base = Botz.const_get "#{as}_scraper_macro".classify
+       accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+       accessor = accessor_class.new(encoding: encoding)
+       binder_base = Botz::Scraper.const_get(as.to_s.classify)
        binder = Class.new(binder_base, &block)
        binder.define_singleton_method(:name) { class_name }
        crawler_class = self
        scraper_class = Class.new do
          define_singleton_method(:crawler_class) { crawler_class }
          define_singleton_method(:bind) do |url|
-           downloader.call(url) do |resource|
+           accessor.call(url) do |resource|
              binder.new(scraper_class, resource)
            end
          end
@@ -37,12 +36,14 @@ class Botz::Definition
        scrapers[name] = scraper_class
      end

-     def spider(name, start_url = nil, as: :html, &block)
-       downloader = Botz::Downloader.new(as)
+     def spider(name, start_url = nil, encoding: nil, as: :html, &block)
+       accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+       accessor = accessor_class.new(start_url: start_url, encoding: encoding)
        spider = Botz::Spider.new(&block)
        spider_class = Class.new do
+         define_singleton_method(:accessor) { accessor }
          define_singleton_method(:call) do |url = start_url, &spider_block|
-           downloader.call(url) do |resource|
+           accessor.call(url) do |resource|
              spider.call(resource, &spider_block)
            end
          end
@@ -50,19 +51,6 @@
        const_set("#{name}_spider".classify, spider_class)
        spiders[name] = spider_class
      end
-
-     def before_context(url:)
-       downloader = Botz::Downloader.new(:html)
-       before_context_class = Class.new do
-         define_singleton_method(:call) do
-           downloader.call(url) do |page|
-             yield(page)
-             page.mech
-           end
-         end
-       end
-       const_set('before_context'.classify, before_context_class)
-     end
    end
    # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
  end
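
For orientation, a definition file written against the 0.4.0 signatures shown above might look like the sketch below. The :shop and :example names, the URL, and the block bodies are placeholders, not part of the gem; domain: may now be omitted (it defaults to nil), and encoding: is passed through to the resource accessor.

# Illustrative sketch only, not shipped with the gem.
Botz.define(:shop) do
  # start_url and encoding: end up in Botz::ResourceAccessor::Html (Mechanize).
  spider(:example, 'http://example.com', encoding: 'UTF-8') do |html, yielder|
    # yielder.call(url or resource)
  end

  # as: :html is the default and resolves Botz::Scraper::Html / Botz::ResourceAccessor::Html.
  scraper(:example, encoding: 'UTF-8') do
  end
end
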
data/lib/botz/definition_file.rb CHANGED
@@ -6,7 +6,7 @@
  class Botz::DefinitionFile
    attr_reader :path
    attr_reader :definition
-   delegate :spiders, :scrapers, :Output, to: :definition
+   delegate :spiders, :scrapers, :output, to: :definition

    def self.open(filepath)
      object = new(filepath)
data/lib/botz/resource_accessor.rb ADDED
@@ -0,0 +1,10 @@
+ # frozen_string_literal: true
+
+ #
+ # This class is responsible for actually making a network connection and downloading hypertext
+ #
+ module Botz::ResourceAccessor
+   extend ActiveSupport::Autoload
+   autoload :Html
+   autoload :DirectHtml
+ end
data/lib/botz/resource_accessor/direct_html.rb ADDED
@@ -0,0 +1,14 @@
+ # frozen_string_literal: true
+
+ #
+ # Nokogiri wrapper
+ #
+ class Botz::ResourceAccessor::DirectHtml
+   def initialize(encoding: nil)
+     @encoding = encoding
+   end
+
+   def call(html)
+     yield Nokogiri::HTML.parse(html)
+   end
+ end
data/lib/botz/resource_accessor/html.rb ADDED
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ #
+ # Mechanize wrapper
+ #
+ class Botz::ResourceAccessor::Html
+   USER_AGENT = [
+     'Mozilla/5.0',
+     '(Macintosh; Intel Mac OS X 10_12_6)',
+     'AppleWebKit/537.36',
+     '(KHTML, like Gecko)',
+     'Chrome/64.0.3282.186',
+     'Safari/537.36'
+   ].join(' ')
+
+   attr_reader :start_url
+   attr_reader :agent
+
+   def initialize(start_url: nil, encoding: nil)
+     @start_url = start_url
+     @agent = Mechanize.new
+     if encoding
+       @agent.default_encoding = encoding
+       @agent.force_default_encoding = true
+     end
+     @agent.user_agent = USER_AGENT
+   end
+
+   def call(url = @start_url, &block)
+     fail 'URL is undefined' if url.blank?
+
+     agent.get(url, &block)
+   end
+ end
data/lib/botz/scraper.rb ADDED
@@ -0,0 +1,10 @@
+ # frozen_string_literal: true
+
+ #
+ # scraper namespace
+ #
+ module Botz::Scraper
+   extend ActiveSupport::Autoload
+   autoload :Html
+   autoload :DirectHtml
+ end
data/lib/botz/scraper/direct_html.rb ADDED
@@ -0,0 +1,64 @@
+ # frozen_string_literal: true
+
+ #
+ # direct resource to html scraping
+ #
+ class Botz::Scraper::DirectHtml
+   include ActiveModel::Model
+   include ActiveModel::Attributes
+
+   #
+   # Scraper error class
+   #
+   class Error < StandardError
+     def initialize(scraper_class, errors)
+       super("#{scraper_class} # #{errors.full_messages}")
+     end
+   end
+
+   attr_reader :scraper_class
+   attr_reader :html
+
+   def initialize(scraper_class, resource)
+     @scraper_class = scraper_class
+     @html = resource
+   end
+
+   class << self
+     def field_names
+       @field_names ||= []
+     end
+   end
+
+   def to_h
+     fetched_at = Time.current
+     fetched_on = fetched_at.beginning_of_day
+     timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
+     self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
+   end
+
+   def call
+     fail Error.new(scraper_class, errors) if invalid?
+
+     yield(to_h)
+   end
+
+   # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+   def self.field(name, path = nil, persist: true, &block)
+     if persist
+       field_names << name
+       case name
+       when /.*\?/
+         validates name, inclusion: { in: [true, false] }
+       else
+         validates name, presence: true, allow_blank: true
+       end
+     end
+
+     return define_method(name) { instance_exec(html, &block) } if path.nil?
+     return define_method(name) { html.search(path).text.strip } if block.nil?
+
+     define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
+   end
+   # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+ end
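
The field macro above drives the scraping DSL. A hypothetical scraper block using it might read as follows; the field names and CSS selectors are invented for illustration, and as: :direct_html pairs this class with the Nokogiri-based accessor.

# Illustrative sketch of the field DSL, inside a Botz.define block.
scraper(:product, as: :direct_html) do
  field(:title, 'h1')                                           # path only: stripped text of the matched nodes
  field(:price, '.price') { |node| node.text.to_i }             # path + block: first matching node is passed to the block
  field(:in_stock?) { |html| html.search('.soldout').empty? }   # "?" names are validated as true/false
end
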
data/lib/botz/html_scraper_macro.rb → data/lib/botz/scraper/html.rb RENAMED
@@ -3,7 +3,7 @@
  #
  # DSL for parsing html into objects
  #
- class Botz::HtmlScraperMacro
+ class Botz::Scraper::Html
    include ActiveModel::Model
    include ActiveModel::Attributes

@@ -24,7 +24,6 @@ class Botz::HtmlScraperMacro
      @scraper_class = scraper_class
      @url = resource.uri
      @html = resource
-     @writer = writer
    end

    class << self
data/lib/botz/shell.rb CHANGED
@@ -17,9 +17,9 @@ class Botz::Shell
      while line = STDIN.gets
        url = line.strip
        begin
-         command.call(url, &definition_file.Output)
-       rescue
-         STDERR.puts "ERROR #{command} #{url}"
+         command.call(url, &definition_file.output)
+       rescue => e
+         STDERR.puts "ERROR #{e}"
        end
      end
    end
@@ -47,4 +47,31 @@
        }
      SHELL
    end
+
+   # rubocop:disable Metrics/MethodLength
+   def build(name)
+     File.open("#{name}.rb", 'w') do |f|
+       f.write <<~RUBY
+         # frozen_string_literal: true
+
+         Botz.define(:#{name}) do
+           spider(:example, 'http://example.com') do |html, yielder|
+             # yielder.call(url or resource)
+           end
+
+           scraper(:example) do
+           end
+         end
+       RUBY
+     end
+
+     File.open("#{name}.sh", 'w') do |f|
+       f.write <<~SHELL
+         #!/bin/bash
+         eval "$(botz $(dirname "${0}")/#{name}.rb shell)"
+         spider example
+       SHELL
+     end
+   end
+   # rubocop:enable Metrics/MethodLength
  end
data/lib/botz/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Botz
-   VERSION = '0.3.0'
+   VERSION = '0.4.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: botz
  version: !ruby/object:Gem::Version
-   version: 0.3.0
+   version: 0.4.0
  platform: ruby
  authors:
  - aileron
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-06-29 00:00:00.000000000 Z
+ date: 2019-07-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -151,8 +151,12 @@ files:
  - lib/botz/console.rb
  - lib/botz/definition.rb
  - lib/botz/definition_file.rb
- - lib/botz/downloader.rb
- - lib/botz/html_scraper_macro.rb
+ - lib/botz/resource_accessor.rb
+ - lib/botz/resource_accessor/direct_html.rb
+ - lib/botz/resource_accessor/html.rb
+ - lib/botz/scraper.rb
+ - lib/botz/scraper/direct_html.rb
+ - lib/botz/scraper/html.rb
  - lib/botz/shell.rb
  - lib/botz/spider.rb
  - lib/botz/version.rb
data/lib/botz/downloader.rb DELETED
@@ -1,43 +0,0 @@
- # frozen_string_literal: true
-
- #
- # This class is responsible for actually making a network connection and downloading hypertext
- #
- class Botz::Downloader
-   include ActiveModel::Model
-   include ActiveModel::Attributes
-   USER_AGENT = [
-     'Mozilla/5.0',
-     '(Macintosh; Intel Mac OS X 10_12_6)',
-     'AppleWebKit/537.36',
-     '(KHTML, like Gecko)',
-     'Chrome/64.0.3282.186',
-     'Safari/537.36'
-   ].join(' ')
-
-   html_loader = lambda do |ctx, url, block|
-     block_result = nil
-     agent = Mechanize.new
-     agent.user_agent = USER_AGENT
-     ctx&.call(agent)
-     agent.get(url) { |page| block_result = block.call(page) }
-     block_result
-   end
-
-   json_loader = lambda do |_ctx, url, block|
-     block.call JSON.parse(OpenURI.open_uri(url, 'User-Agent' => USER_AGENT))
-   end
-
-   class_attribute :loaders, default: { html: html_loader, json: json_loader }
-
-   attribute :context
-   attribute :loader
-
-   def initialize(name, context = nil)
-     super(loader: loaders[name], context: context)
-   end
-
-   def call(url, &block)
-     loader.call(context, url, block)
-   end
- end