botz 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/bin/console +5 -3
- data/exe/botz +1 -0
- data/lib/botz.rb +3 -3
- data/lib/botz/console.rb +0 -12
- data/lib/botz/definition.rb +12 -24
- data/lib/botz/definition_file.rb +1 -1
- data/lib/botz/resource_accessor.rb +10 -0
- data/lib/botz/resource_accessor/direct_html.rb +14 -0
- data/lib/botz/resource_accessor/html.rb +34 -0
- data/lib/botz/scraper.rb +10 -0
- data/lib/botz/scraper/direct_html.rb +64 -0
- data/lib/botz/{html_scraper_macro.rb → scraper/html.rb} +1 -2
- data/lib/botz/shell.rb +30 -3
- data/lib/botz/version.rb +1 -1
- metadata +8 -4
- data/lib/botz/downloader.rb +0 -43
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
|
4
|
+
data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
|
7
|
+
data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095
|
data/Gemfile.lock
CHANGED
data/bin/console
CHANGED
data/exe/botz
CHANGED
@@ -7,6 +7,7 @@ case ARGV[0]&.to_sym
|
|
7
7
|
when :spider then Botz.open(ARGV[1]).shell.spider(ARGV[2])
|
8
8
|
when :scraper then Botz.open(ARGV[1]).shell.scraper(ARGV[2])
|
9
9
|
when :shell then Botz.open(ARGV[1]).shell.function
|
10
|
+
when :new then Botz.open(ARGV[1]).shell.build
|
10
11
|
when :console
|
11
12
|
if ARGV[1].blank?
|
12
13
|
Botz.console
|
data/lib/botz.rb
CHANGED
@@ -15,8 +15,8 @@ module Botz
|
|
15
15
|
autoload :Definition
|
16
16
|
autoload :DefinitionFile
|
17
17
|
autoload :Spider
|
18
|
-
autoload :
|
19
|
-
autoload :
|
18
|
+
autoload :Scraper
|
19
|
+
autoload :ResourceAccessor
|
20
20
|
|
21
21
|
const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
|
22
22
|
|
@@ -29,7 +29,7 @@ module Botz
|
|
29
29
|
::Botz::DefinitionFile.open(filepath)
|
30
30
|
end
|
31
31
|
|
32
|
-
def self.define(name, domain
|
32
|
+
def self.define(name, domain: nil, &block)
|
33
33
|
crawler_definition = Class.new(::Botz::Definition, &block)
|
34
34
|
crawler_definition.domain = domain
|
35
35
|
crawler_class_name = name.to_s.camelize
|
data/lib/botz/console.rb
CHANGED
@@ -18,16 +18,4 @@ class Botz::Console
|
|
18
18
|
def reload!
|
19
19
|
@definition_file&.eval_definition
|
20
20
|
end
|
21
|
-
|
22
|
-
def scraper(name, url, &block)
|
23
|
-
scrapers[name.to_sym].call(url, &block)
|
24
|
-
end
|
25
|
-
|
26
|
-
def spider(name, url = nil, &block)
|
27
|
-
if url
|
28
|
-
spiders[name.to_sym].call(url, &block)
|
29
|
-
else
|
30
|
-
spiders[name.to_sym].call(&block)
|
31
|
-
end
|
32
|
-
end
|
33
21
|
end
|
data/lib/botz/definition.rb
CHANGED
@@ -7,27 +7,26 @@ class Botz::Definition
|
|
7
7
|
class_attribute :domain
|
8
8
|
class_attribute :spiders, default: {}
|
9
9
|
class_attribute :scrapers, default: {}
|
10
|
-
|
11
|
-
Output = ->(result) { STDOUT.puts(result.to_json) }
|
10
|
+
class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
|
12
11
|
|
13
12
|
def output(&block)
|
14
|
-
|
15
|
-
const_set(:Output, block)
|
13
|
+
self.output = block
|
16
14
|
end
|
17
15
|
|
18
16
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
19
17
|
class << self
|
20
|
-
def scraper(name, as: :html, &block)
|
18
|
+
def scraper(name, encoding: nil, as: :html, &block)
|
21
19
|
class_name = "#{name}_scraper".classify
|
22
|
-
|
23
|
-
|
20
|
+
accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
|
21
|
+
accessor = accessor_class.new(encoding: encoding)
|
22
|
+
binder_base = Botz::Scraper.const_get(as.to_s.classify)
|
24
23
|
binder = Class.new(binder_base, &block)
|
25
24
|
binder.define_singleton_method(:name) { class_name }
|
26
25
|
crawler_class = self
|
27
26
|
scraper_class = Class.new do
|
28
27
|
define_singleton_method(:crawler_class) { crawler_class }
|
29
28
|
define_singleton_method(:bind) do |url|
|
30
|
-
|
29
|
+
accessor.call(url) do |resource|
|
31
30
|
binder.new(scraper_class, resource)
|
32
31
|
end
|
33
32
|
end
|
@@ -37,12 +36,14 @@ class Botz::Definition
|
|
37
36
|
scrapers[name] = scraper_class
|
38
37
|
end
|
39
38
|
|
40
|
-
def spider(name, start_url = nil, as: :html, &block)
|
41
|
-
|
39
|
+
def spider(name, start_url = nil, encoding: nil, as: :html, &block)
|
40
|
+
accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
|
41
|
+
accessor = accessor_class.new(start_url: start_url, encoding: encoding)
|
42
42
|
spider = Botz::Spider.new(&block)
|
43
43
|
spider_class = Class.new do
|
44
|
+
define_singleton_method(:accessor) { accessor }
|
44
45
|
define_singleton_method(:call) do |url = start_url, &spider_block|
|
45
|
-
|
46
|
+
accessor.call(url) do |resource|
|
46
47
|
spider.call(resource, &spider_block)
|
47
48
|
end
|
48
49
|
end
|
@@ -50,19 +51,6 @@ class Botz::Definition
|
|
50
51
|
const_set("#{name}_spider".classify, spider_class)
|
51
52
|
spiders[name] = spider_class
|
52
53
|
end
|
53
|
-
|
54
|
-
def before_context(url:)
|
55
|
-
downloader = Botz::Downloader.new(:html)
|
56
|
-
before_context_class = Class.new do
|
57
|
-
define_singleton_method(:call) do
|
58
|
-
downloader.call(url) do |page|
|
59
|
-
yield(page)
|
60
|
-
page.mech
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
64
|
-
const_set('before_context'.classify, before_context_class)
|
65
|
-
end
|
66
54
|
end
|
67
55
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
68
56
|
end
|
data/lib/botz/definition_file.rb
CHANGED
data/lib/botz/resource_accessor/html.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Mechanize wrapper
|
5
|
+
#
|
6
|
+
class Botz::ResourceAccessor::Html
|
7
|
+
USER_AGENT = [
|
8
|
+
'Mozilla/5.0',
|
9
|
+
'(Macintosh; Intel Mac OS X 10_12_6)',
|
10
|
+
'AppleWebKit/537.36',
|
11
|
+
'(KHTML, like Gecko)',
|
12
|
+
'Chrome/64.0.3282.186',
|
13
|
+
'Safari/537.36'
|
14
|
+
].join(' ')
|
15
|
+
|
16
|
+
attr_reader :start_url
|
17
|
+
attr_reader :agent
|
18
|
+
|
19
|
+
def initialize(start_url: nil, encoding: nil)
|
20
|
+
@start_url = start_url
|
21
|
+
@agent = Mechanize.new
|
22
|
+
if encoding
|
23
|
+
@agent.default_encoding = encoding
|
24
|
+
@agent.force_default_encoding = true
|
25
|
+
end
|
26
|
+
@agent.user_agent = USER_AGENT
|
27
|
+
end
|
28
|
+
|
29
|
+
def call(url = @start_url, &block)
|
30
|
+
fail 'URL is undefined' if url.blank?
|
31
|
+
|
32
|
+
agent.get(url, &block)
|
33
|
+
end
|
34
|
+
end
|
data/lib/botz/scraper.rb
ADDED
data/lib/botz/scraper/direct_html.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# direct resource to html scraping
|
5
|
+
#
|
6
|
+
class Botz::Scraper::DirectHtml
|
7
|
+
include ActiveModel::Model
|
8
|
+
include ActiveModel::Attributes
|
9
|
+
|
10
|
+
#
|
11
|
+
# Scraper error class
|
12
|
+
#
|
13
|
+
class Error < StandardError
|
14
|
+
def initialize(scraper_class, errors)
|
15
|
+
super("#{scraper_class} # #{errors.full_messages}")
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
attr_reader :scraper_class
|
20
|
+
attr_reader :html
|
21
|
+
|
22
|
+
def initialize(scraper_class, resource)
|
23
|
+
@scraper_class = scraper_class
|
24
|
+
@html = resource
|
25
|
+
end
|
26
|
+
|
27
|
+
class << self
|
28
|
+
def field_names
|
29
|
+
@field_names ||= []
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def to_h
|
34
|
+
fetched_at = Time.current
|
35
|
+
fetched_on = fetched_at.beginning_of_day
|
36
|
+
timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
|
37
|
+
self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
|
38
|
+
end
|
39
|
+
|
40
|
+
def call
|
41
|
+
fail Error.new(scraper_class, errors) if invalid?
|
42
|
+
|
43
|
+
yield(to_h)
|
44
|
+
end
|
45
|
+
|
46
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
47
|
+
def self.field(name, path = nil, persist: true, &block)
|
48
|
+
if persist
|
49
|
+
field_names << name
|
50
|
+
case name
|
51
|
+
when /.*\?/
|
52
|
+
validates name, inclusion: { in: [true, false] }
|
53
|
+
else
|
54
|
+
validates name, presence: true, allow_blank: true
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
return define_method(name) { instance_exec(html, &block) } if path.nil?
|
59
|
+
return define_method(name) { html.search(path).text.strip } if block.nil?
|
60
|
+
|
61
|
+
define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
|
62
|
+
end
|
63
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
64
|
+
end
|
data/lib/botz/{html_scraper_macro.rb → scraper/html.rb}
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
# DSL for parsing html into objects
|
5
5
|
#
|
6
|
-
class Botz::
|
6
|
+
class Botz::Scraper::Html
|
7
7
|
include ActiveModel::Model
|
8
8
|
include ActiveModel::Attributes
|
9
9
|
|
@@ -24,7 +24,6 @@ class Botz::HtmlScraperMacro
|
|
24
24
|
@scraper_class = scraper_class
|
25
25
|
@url = resource.uri
|
26
26
|
@html = resource
|
27
|
-
@writer = writer
|
28
27
|
end
|
29
28
|
|
30
29
|
class << self
|
data/lib/botz/shell.rb
CHANGED
@@ -17,9 +17,9 @@ class Botz::Shell
|
|
17
17
|
while line = STDIN.gets
|
18
18
|
url = line.strip
|
19
19
|
begin
|
20
|
-
command.call(url, &definition_file.
|
21
|
-
rescue
|
22
|
-
STDERR.puts "ERROR #{
|
20
|
+
command.call(url, &definition_file.output)
|
21
|
+
rescue => e
|
22
|
+
STDERR.puts "ERROR #{e}"
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
@@ -47,4 +47,31 @@ class Botz::Shell
|
|
47
47
|
}
|
48
48
|
SHELL
|
49
49
|
end
|
50
|
+
|
51
|
+
# rubocop:disable Metrics/MethodLength
|
52
|
+
def build(name)
|
53
|
+
File.open("#{name}.rb", 'w') do |f|
|
54
|
+
f.write <<~RUBY
|
55
|
+
# frozen_string_literal: true
|
56
|
+
|
57
|
+
Botz.define(:#{name}) do
|
58
|
+
spider(:example, 'http://example.com') do |html, yielder|
|
59
|
+
# yielder.call(url or resource)
|
60
|
+
end
|
61
|
+
|
62
|
+
scraper(:example) do
|
63
|
+
end
|
64
|
+
end
|
65
|
+
RUBY
|
66
|
+
end
|
67
|
+
|
68
|
+
File.open("#{name}.sh", 'w') do |f|
|
69
|
+
f.write <<~SHELL
|
70
|
+
#!/bin/bash
|
71
|
+
eval "$(botz $(dirname "${0}")/#{name}.rb shell)"
|
72
|
+
spider example
|
73
|
+
SHELL
|
74
|
+
end
|
75
|
+
end
|
76
|
+
# rubocop:enable Metrics/MethodLength
|
50
77
|
end
|
data/lib/botz/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: botz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-07-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -151,8 +151,12 @@ files:
|
|
151
151
|
- lib/botz/console.rb
|
152
152
|
- lib/botz/definition.rb
|
153
153
|
- lib/botz/definition_file.rb
|
154
|
-
- lib/botz/
|
155
|
-
- lib/botz/
|
154
|
+
- lib/botz/resource_accessor.rb
|
155
|
+
- lib/botz/resource_accessor/direct_html.rb
|
156
|
+
- lib/botz/resource_accessor/html.rb
|
157
|
+
- lib/botz/scraper.rb
|
158
|
+
- lib/botz/scraper/direct_html.rb
|
159
|
+
- lib/botz/scraper/html.rb
|
156
160
|
- lib/botz/shell.rb
|
157
161
|
- lib/botz/spider.rb
|
158
162
|
- lib/botz/version.rb
|
data/lib/botz/downloader.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
#
|
4
|
-
# This class is responsible for actually making a network connection and downloading hypertext
|
5
|
-
#
|
6
|
-
class Botz::Downloader
|
7
|
-
include ActiveModel::Model
|
8
|
-
include ActiveModel::Attributes
|
9
|
-
USER_AGENT = [
|
10
|
-
'Mozilla/5.0',
|
11
|
-
'(Macintosh; Intel Mac OS X 10_12_6)',
|
12
|
-
'AppleWebKit/537.36',
|
13
|
-
'(KHTML, like Gecko)',
|
14
|
-
'Chrome/64.0.3282.186',
|
15
|
-
'Safari/537.36'
|
16
|
-
].join(' ')
|
17
|
-
|
18
|
-
html_loader = lambda do |ctx, url, block|
|
19
|
-
block_result = nil
|
20
|
-
agent = Mechanize.new
|
21
|
-
agent.user_agent = USER_AGENT
|
22
|
-
ctx&.call(agent)
|
23
|
-
agent.get(url) { |page| block_result = block.call(page) }
|
24
|
-
block_result
|
25
|
-
end
|
26
|
-
|
27
|
-
json_loader = lambda do |_ctx, url, block|
|
28
|
-
block.call JSON.parse(OpenURI.open_uri(url, 'User-Agent' => USER_AGENT))
|
29
|
-
end
|
30
|
-
|
31
|
-
class_attribute :loaders, default: { html: html_loader, json: json_loader }
|
32
|
-
|
33
|
-
attribute :context
|
34
|
-
attribute :loader
|
35
|
-
|
36
|
-
def initialize(name, context = nil)
|
37
|
-
super(loader: loaders[name], context: context)
|
38
|
-
end
|
39
|
-
|
40
|
-
def call(url, &block)
|
41
|
-
loader.call(context, url, block)
|
42
|
-
end
|
43
|
-
end
|