botz 0.3.0 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9fc66e77b306d2024ecaf98a7de9cb68a4489a42d0ccd7435245b4d6a385161b
- data.tar.gz: f42f5b708de3b5e52ea29947a5d08cc31bcb0dadd89dcaa69b71de9b5ad001a9
+ metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
+ data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
  SHA512:
- metadata.gz: 4e8c1e31edde612bcf993fd470fc4249bfd236ffd2e3640cca22f1098d22a5c7b0bfa9ade04f0263c94df5faa9901b4743dcc5cc7f343512c9b6e079711a78f4
- data.tar.gz: 52e4dbb1fe4ac0f9b07654d012a51e3a17b86a8279fe99eb34bb445fbc9845589b8adce472a92f636609c938ef486a9b86f81f31a5831f7315afdabafca5a492
+ metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
+ data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- botz (0.2.0)
+ botz (0.4.0)
  activemodel (~> 5.2)
  activesupport (~> 5.2)
  mechanize
data/bin/console CHANGED
@@ -14,7 +14,9 @@ def reload!
  ActiveSupport::DescendantsTracker.clear
  ActiveSupport::Reloader.reload!
  end
- Pry.start

- # require 'irb'
- # IRB.start(__FILE__)
+ if ARGV[0]
+ Botz.open(ARGV[0]).console
+ else
+ Pry.start
+ end
data/exe/botz CHANGED
@@ -7,6 +7,7 @@ case ARGV[0]&.to_sym
  when :spider then Botz.open(ARGV[1]).shell.spider(ARGV[2])
  when :scraper then Botz.open(ARGV[1]).shell.scraper(ARGV[2])
  when :shell then Botz.open(ARGV[1]).shell.function
+ when :new then Botz.open(ARGV[1]).shell.build
  when :console
  if ARGV[1].blank?
  Botz.console
data/lib/botz.rb CHANGED
@@ -15,8 +15,8 @@ module Botz
  autoload :Definition
  autoload :DefinitionFile
  autoload :Spider
- autoload :Downloader
- autoload :HtmlScraperMacro
+ autoload :Scraper
+ autoload :ResourceAccessor

  const_set(:Crawler, Module.new) unless const_defined?(:Crawler)

@@ -29,7 +29,7 @@ module Botz
  ::Botz::DefinitionFile.open(filepath)
  end

- def self.define(name, domain:, &block)
+ def self.define(name, domain: nil, &block)
  crawler_definition = Class.new(::Botz::Definition, &block)
  crawler_definition.domain = domain
  crawler_class_name = name.to_s.camelize
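
Note: `domain:` is now optional (it defaults to nil), so a definition no longer has to name a domain up front. A minimal sketch of a definition using the DSL shown elsewhere in this diff (the :example names, URL, and selector are illustrative):

    Botz.define(:example) do
      spider(:list, 'http://example.com') do |html, yielder|
        html.links.each { |link| yielder.call(link.href) }  # hand URLs to a scraper
      end

      scraper(:item) do
        field(:title, 'h1')  # field DSL as defined on the scraper classes below
      end
    end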
data/lib/botz/console.rb CHANGED
@@ -18,16 +18,4 @@ class Botz::Console
  def reload!
  @definition_file&.eval_definition
  end
-
- def scraper(name, url, &block)
- scrapers[name.to_sym].call(url, &block)
- end
-
- def spider(name, url = nil, &block)
- if url
- spiders[name.to_sym].call(url, &block)
- else
- spiders[name.to_sym].call(&block)
- end
- end
  end
data/lib/botz/definition.rb CHANGED
@@ -7,27 +7,26 @@ class Botz::Definition
  class_attribute :domain
  class_attribute :spiders, default: {}
  class_attribute :scrapers, default: {}
-
- Output = ->(result) { STDOUT.puts(result.to_json) }
+ class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }

  def output(&block)
- remove_const(:Output)
- const_set(:Output, block)
+ self.output = block
  end

  # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  class << self
- def scraper(name, as: :html, &block)
+ def scraper(name, encoding: nil, as: :html, &block)
  class_name = "#{name}_scraper".classify
- downloader = Botz::Downloader.new(as)
- binder_base = Botz.const_get "#{as}_scraper_macro".classify
+ accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+ accessor = accessor_class.new(encoding: encoding)
+ binder_base = Botz::Scraper.const_get(as.to_s.classify)
  binder = Class.new(binder_base, &block)
  binder.define_singleton_method(:name) { class_name }
  crawler_class = self
  scraper_class = Class.new do
  define_singleton_method(:crawler_class) { crawler_class }
  define_singleton_method(:bind) do |url|
- downloader.call(url) do |resource|
+ accessor.call(url) do |resource|
  binder.new(scraper_class, resource)
  end
  end
@@ -37,12 +36,14 @@ class Botz::Definition
  scrapers[name] = scraper_class
  end

- def spider(name, start_url = nil, as: :html, &block)
- downloader = Botz::Downloader.new(as)
+ def spider(name, start_url = nil, encoding: nil, as: :html, &block)
+ accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
+ accessor = accessor_class.new(start_url: start_url, encoding: encoding)
  spider = Botz::Spider.new(&block)
  spider_class = Class.new do
+ define_singleton_method(:accessor) { accessor }
  define_singleton_method(:call) do |url = start_url, &spider_block|
- downloader.call(url) do |resource|
+ accessor.call(url) do |resource|
  spider.call(resource, &spider_block)
  end
  end
@@ -50,19 +51,6 @@ class Botz::Definition
  const_set("#{name}_spider".classify, spider_class)
  spiders[name] = spider_class
  end
-
- def before_context(url:)
- downloader = Botz::Downloader.new(:html)
- before_context_class = Class.new do
- define_singleton_method(:call) do
- downloader.call(url) do |page|
- yield(page)
- page.mech
- end
- end
- end
- const_set('before_context'.classify, before_context_class)
- end
  end
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
  end
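
Both macros now take an encoding: option that is forwarded to the resource accessor, and as: is resolved through the new namespaces via as.to_s.classify (:html maps to Botz::ResourceAccessor::Html and Botz::Scraper::Html, :direct_html to the DirectHtml pair). A hedged sketch of how the options are used; the names, URL, and selector are illustrative:

    Botz.define(:shop) do
      # Mechanize-backed crawl, forcing the page encoding
      spider(:list, 'http://example.com/items', encoding: 'UTF-8') do |html, yielder|
        html.links.each { |link| yielder.call(link.href) }
      end

      # scrape a raw HTML string instead of fetching a URL
      scraper(:item, as: :direct_html) do
        field(:title, 'h1')
      end
    end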
data/lib/botz/definition_file.rb CHANGED
@@ -6,7 +6,7 @@
  class Botz::DefinitionFile
  attr_reader :path
  attr_reader :definition
- delegate :spiders, :scrapers, :Output, to: :definition
+ delegate :spiders, :scrapers, :output, to: :definition

  def self.open(filepath)
  object = new(filepath)
data/lib/botz/resource_accessor.rb ADDED
@@ -0,0 +1,10 @@
+ # frozen_string_literal: true
+
+ #
+ # This class is responsible for actually making a network connection and downloading hypertext
+ #
+ module Botz::ResourceAccessor
+ extend ActiveSupport::Autoload
+ autoload :Html
+ autoload :DirectHtml
+ end
data/lib/botz/resource_accessor/direct_html.rb ADDED
@@ -0,0 +1,14 @@
+ # frozen_string_literal: true
+
+ #
+ # Nokogiri wrapper
+ #
+ class Botz::ResourceAccessor::DirectHtml
+ def initialize(encoding: nil)
+ @encoding = encoding
+ end
+
+ def call(html)
+ yield Nokogiri::HTML.parse(html)
+ end
+ end
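
Used on its own, this accessor simply parses whatever string it is handed and yields the Nokogiri document; @encoding is stored but not otherwise used here. A small sketch with illustrative markup:

    accessor = Botz::ResourceAccessor::DirectHtml.new
    accessor.call('<html><body><h1>Hello</h1></body></html>') do |doc|
      puts doc.at('h1').text  # => "Hello"
    end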
data/lib/botz/resource_accessor/html.rb ADDED
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ #
+ # Mechanize wrapper
+ #
+ class Botz::ResourceAccessor::Html
+ USER_AGENT = [
+ 'Mozilla/5.0',
+ '(Macintosh; Intel Mac OS X 10_12_6)',
+ 'AppleWebKit/537.36',
+ '(KHTML, like Gecko)',
+ 'Chrome/64.0.3282.186',
+ 'Safari/537.36'
+ ].join(' ')
+
+ attr_reader :start_url
+ attr_reader :agent
+
+ def initialize(start_url: nil, encoding: nil)
+ @start_url = start_url
+ @agent = Mechanize.new
+ if encoding
+ @agent.default_encoding = encoding
+ @agent.force_default_encoding = true
+ end
+ @agent.user_agent = USER_AGENT
+ end
+
+ def call(url = @start_url, &block)
+ fail 'URL is undefined' if url.blank?
+
+ agent.get(url, &block)
+ end
+ end
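
The Mechanize-backed accessor can also be used standalone: encoding: forces Mechanize's default encoding, and call falls back to start_url when no URL is passed. A hedged sketch with an illustrative URL and encoding:

    accessor = Botz::ResourceAccessor::Html.new(start_url: 'http://example.com', encoding: 'UTF-8')
    accessor.call { |page| puts page.title }                            # fetches start_url
    accessor.call('http://example.com/other') { |page| puts page.uri }  # explicit URL wins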
data/lib/botz/scraper.rb ADDED
@@ -0,0 +1,10 @@
+ # frozen_string_literal: true
+
+ #
+ # scraper namespace
+ #
+ module Botz::Scraper
+ extend ActiveSupport::Autoload
+ autoload :Html
+ autoload :DirectHtml
+ end
data/lib/botz/scraper/direct_html.rb ADDED
@@ -0,0 +1,64 @@
+ # frozen_string_literal: true
+
+ #
+ # direct resource to html scraping
+ #
+ class Botz::Scraper::DirectHtml
+ include ActiveModel::Model
+ include ActiveModel::Attributes
+
+ #
+ # Scraper error class
+ #
+ class Error < StandardError
+ def initialize(scraper_class, errors)
+ super("#{scraper_class} # #{errors.full_messages}")
+ end
+ end
+
+ attr_reader :scraper_class
+ attr_reader :html
+
+ def initialize(scraper_class, resource)
+ @scraper_class = scraper_class
+ @html = resource
+ end
+
+ class << self
+ def field_names
+ @field_names ||= []
+ end
+ end
+
+ def to_h
+ fetched_at = Time.current
+ fetched_on = fetched_at.beginning_of_day
+ timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
+ self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
+ end
+
+ def call
+ fail Error.new(scraper_class, errors) if invalid?
+
+ yield(to_h)
+ end
+
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+ def self.field(name, path = nil, persist: true, &block)
+ if persist
+ field_names << name
+ case name
+ when /.*\?/
+ validates name, inclusion: { in: [true, false] }
+ else
+ validates name, presence: true, allow_blank: true
+ end
+ end
+
+ return define_method(name) { instance_exec(html, &block) } if path.nil?
+ return define_method(name) { html.search(path).text.strip } if block.nil?
+
+ define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
+ end
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+ end
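
The field macro does most of the work: given only a selector it returns the stripped text of the match, given a block it yields either the whole document (no selector) or the first matched node, names ending in ? get a true/false inclusion validation, other fields a presence validation, and call raises Error when the object is invalid. A hedged sketch of a scraper built directly on this class; the class name, selectors, and sample markup are illustrative:

    class ItemScraper < Botz::Scraper::DirectHtml
      field(:title, 'h1')                                  # text of the first h1
      field(:price, '.price') { |node| node.text.to_i }    # block receives the matched node
      field(:sold_out?) { |html| html.at('.stock').nil? }  # boolean field
    end

    doc = Nokogiri::HTML.parse('<h1>Book</h1><div class="price">100</div>')
    ItemScraper.new(ItemScraper, doc).call { |record| p record }  # record carries fetched_at/fetched_on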
data/lib/botz/scraper/html.rb CHANGED (renamed from data/lib/botz/html_scraper_macro.rb)
@@ -3,7 +3,7 @@
  #
  # DSL for parsing html into objects
  #
- class Botz::HtmlScraperMacro
+ class Botz::Scraper::Html
  include ActiveModel::Model
  include ActiveModel::Attributes

@@ -24,7 +24,6 @@ class Botz::HtmlScraperMacro
  @scraper_class = scraper_class
  @url = resource.uri
  @html = resource
- @writer = writer
  end

  class << self
data/lib/botz/shell.rb CHANGED
@@ -17,9 +17,9 @@ class Botz::Shell
  while line = STDIN.gets
  url = line.strip
  begin
- command.call(url, &definition_file.Output)
- rescue
- STDERR.puts "ERROR #{command} #{url}"
+ command.call(url, &definition_file.output)
+ rescue => e
+ STDERR.puts "ERROR #{e}"
  end
  end
  end
@@ -47,4 +47,31 @@ class Botz::Shell
  }
  SHELL
  end
+
+ # rubocop:disable Metrics/MethodLength
+ def build(name)
+ File.open("#{name}.rb", 'w') do |f|
+ f.write <<~RUBY
+ # frozen_string_literal: true
+
+ Botz.define(:#{name}) do
+ spider(:example, 'http://example.com') do |html, yielder|
+ # yielder.call(url or resource)
+ end
+
+ scraper(:example) do
+ end
+ end
+ RUBY
+ end
+
+ File.open("#{name}.sh", 'w') do |f|
+ f.write <<~SHELL
+ #!/bin/bash
+ eval "$(botz $(dirname "${0}")/#{name}.rb shell)"
+ spider example
+ SHELL
+ end
+ end
+ # rubocop:enable Metrics/MethodLength
  end
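
Botz::Shell#build is what the new `new` subcommand in exe/botz reaches: it writes a <name>.rb starter definition plus a <name>.sh wrapper that evals the shell functions and runs the example spider. A hedged sketch of calling it directly from Ruby; the file names are illustrative, and an existing definition file is assumed since the shell is obtained through Botz.open:

    Botz.open('existing_definition.rb').shell.build('my_crawler')
    # writes ./my_crawler.rb  (a Botz.define(:my_crawler) skeleton)
    # and    ./my_crawler.sh  (eval "$(botz .../my_crawler.rb shell)"; spider example)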
data/lib/botz/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Botz
- VERSION = '0.3.0'
+ VERSION = '0.4.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: botz
  version: !ruby/object:Gem::Version
- version: 0.3.0
+ version: 0.4.0
  platform: ruby
  authors:
  - aileron
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-06-29 00:00:00.000000000 Z
+ date: 2019-07-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -151,8 +151,12 @@ files:
  - lib/botz/console.rb
  - lib/botz/definition.rb
  - lib/botz/definition_file.rb
- - lib/botz/downloader.rb
- - lib/botz/html_scraper_macro.rb
+ - lib/botz/resource_accessor.rb
+ - lib/botz/resource_accessor/direct_html.rb
+ - lib/botz/resource_accessor/html.rb
+ - lib/botz/scraper.rb
+ - lib/botz/scraper/direct_html.rb
+ - lib/botz/scraper/html.rb
  - lib/botz/shell.rb
  - lib/botz/spider.rb
  - lib/botz/version.rb
data/lib/botz/downloader.rb DELETED
@@ -1,43 +0,0 @@
- # frozen_string_literal: true
-
- #
- # This class is responsible for actually making a network connection and downloading hypertext
- #
- class Botz::Downloader
- include ActiveModel::Model
- include ActiveModel::Attributes
- USER_AGENT = [
- 'Mozilla/5.0',
- '(Macintosh; Intel Mac OS X 10_12_6)',
- 'AppleWebKit/537.36',
- '(KHTML, like Gecko)',
- 'Chrome/64.0.3282.186',
- 'Safari/537.36'
- ].join(' ')
-
- html_loader = lambda do |ctx, url, block|
- block_result = nil
- agent = Mechanize.new
- agent.user_agent = USER_AGENT
- ctx&.call(agent)
- agent.get(url) { |page| block_result = block.call(page) }
- block_result
- end
-
- json_loader = lambda do |_ctx, url, block|
- block.call JSON.parse(OpenURI.open_uri(url, 'User-Agent' => USER_AGENT))
- end
-
- class_attribute :loaders, default: { html: html_loader, json: json_loader }
-
- attribute :context
- attribute :loader
-
- def initialize(name, context = nil)
- super(loader: loaders[name], context: context)
- end
-
- def call(url, &block)
- loader.call(context, url, block)
- end
- end
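
For code that called the old downloader directly, the nearest 0.4.0 equivalent is the HTML resource accessor added above; note that the :json loader has no counterpart in this release. A hedged before/after sketch with an illustrative URL:

    # 0.3.0
    Botz::Downloader.new(:html).call('http://example.com') { |page| puts page.title }

    # 0.4.0
    Botz::ResourceAccessor::Html.new.call('http://example.com') { |page| puts page.title }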