botz 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/bin/console +5 -3
- data/exe/botz +1 -0
- data/lib/botz.rb +3 -3
- data/lib/botz/console.rb +0 -12
- data/lib/botz/definition.rb +12 -24
- data/lib/botz/definition_file.rb +1 -1
- data/lib/botz/resource_accessor.rb +10 -0
- data/lib/botz/resource_accessor/direct_html.rb +14 -0
- data/lib/botz/resource_accessor/html.rb +34 -0
- data/lib/botz/scraper.rb +10 -0
- data/lib/botz/scraper/direct_html.rb +64 -0
- data/lib/botz/{html_scraper_macro.rb → scraper/html.rb} +1 -2
- data/lib/botz/shell.rb +30 -3
- data/lib/botz/version.rb +1 -1
- metadata +8 -4
- data/lib/botz/downloader.rb +0 -43
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
|
4
|
+
data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
|
7
|
+
data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095
|
data/Gemfile.lock
CHANGED
data/bin/console
CHANGED
data/exe/botz
CHANGED
@@ -7,6 +7,7 @@ case ARGV[0]&.to_sym
|
|
7
7
|
when :spider then Botz.open(ARGV[1]).shell.spider(ARGV[2])
|
8
8
|
when :scraper then Botz.open(ARGV[1]).shell.scraper(ARGV[2])
|
9
9
|
when :shell then Botz.open(ARGV[1]).shell.function
|
10
|
+
when :new then Botz.open(ARGV[1]).shell.build
|
10
11
|
when :console
|
11
12
|
if ARGV[1].blank?
|
12
13
|
Botz.console
|
data/lib/botz.rb
CHANGED
@@ -15,8 +15,8 @@ module Botz
|
|
15
15
|
autoload :Definition
|
16
16
|
autoload :DefinitionFile
|
17
17
|
autoload :Spider
|
18
|
-
autoload :HtmlScraperMacro
|
19
|
-
autoload :Downloader
|
18
|
+
autoload :Scraper
|
19
|
+
autoload :ResourceAccessor
|
20
20
|
|
21
21
|
const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
|
22
22
|
|
@@ -29,7 +29,7 @@ module Botz
|
|
29
29
|
::Botz::DefinitionFile.open(filepath)
|
30
30
|
end
|
31
31
|
|
32
|
-
def self.define(name, domain
|
32
|
+
def self.define(name, domain: nil, &block)
|
33
33
|
crawler_definition = Class.new(::Botz::Definition, &block)
|
34
34
|
crawler_definition.domain = domain
|
35
35
|
crawler_class_name = name.to_s.camelize
|
data/lib/botz/console.rb
CHANGED
@@ -18,16 +18,4 @@ class Botz::Console
|
|
18
18
|
def reload!
|
19
19
|
@definition_file&.eval_definition
|
20
20
|
end
|
21
|
-
|
22
|
-
def scraper(name, url, &block)
|
23
|
-
scrapers[name.to_sym].call(url, &block)
|
24
|
-
end
|
25
|
-
|
26
|
-
def spider(name, url = nil, &block)
|
27
|
-
if url
|
28
|
-
spiders[name.to_sym].call(url, &block)
|
29
|
-
else
|
30
|
-
spiders[name.to_sym].call(&block)
|
31
|
-
end
|
32
|
-
end
|
33
21
|
end
|
data/lib/botz/definition.rb
CHANGED
@@ -7,27 +7,26 @@ class Botz::Definition
|
|
7
7
|
class_attribute :domain
|
8
8
|
class_attribute :spiders, default: {}
|
9
9
|
class_attribute :scrapers, default: {}
|
10
|
-
|
11
|
-
Output = ->(result) { STDOUT.puts(result.to_json) }
|
10
|
+
class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
|
12
11
|
|
13
12
|
def output(&block)
|
14
|
-
|
15
|
-
const_set(:Output, block)
|
13
|
+
self.output = block
|
16
14
|
end
|
17
15
|
|
18
16
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
19
17
|
class << self
|
20
|
-
def scraper(name, as: :html, &block)
|
18
|
+
def scraper(name, encoding: nil, as: :html, &block)
|
21
19
|
class_name = "#{name}_scraper".classify
|
22
|
-
|
23
|
-
|
20
|
+
accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
|
21
|
+
accessor = accessor_class.new(encoding: encoding)
|
22
|
+
binder_base = Botz::Scraper.const_get(as.to_s.classify)
|
24
23
|
binder = Class.new(binder_base, &block)
|
25
24
|
binder.define_singleton_method(:name) { class_name }
|
26
25
|
crawler_class = self
|
27
26
|
scraper_class = Class.new do
|
28
27
|
define_singleton_method(:crawler_class) { crawler_class }
|
29
28
|
define_singleton_method(:bind) do |url|
|
30
|
-
|
29
|
+
accessor.call(url) do |resource|
|
31
30
|
binder.new(scraper_class, resource)
|
32
31
|
end
|
33
32
|
end
|
@@ -37,12 +36,14 @@ class Botz::Definition
|
|
37
36
|
scrapers[name] = scraper_class
|
38
37
|
end
|
39
38
|
|
40
|
-
def spider(name, start_url = nil, as: :html, &block)
|
41
|
-
|
39
|
+
def spider(name, start_url = nil, encoding: nil, as: :html, &block)
|
40
|
+
accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
|
41
|
+
accessor = accessor_class.new(start_url: start_url, encoding: encoding)
|
42
42
|
spider = Botz::Spider.new(&block)
|
43
43
|
spider_class = Class.new do
|
44
|
+
define_singleton_method(:accessor) { accessor }
|
44
45
|
define_singleton_method(:call) do |url = start_url, &spider_block|
|
45
|
-
|
46
|
+
accessor.call(url) do |resource|
|
46
47
|
spider.call(resource, &spider_block)
|
47
48
|
end
|
48
49
|
end
|
@@ -50,19 +51,6 @@ class Botz::Definition
|
|
50
51
|
const_set("#{name}_spider".classify, spider_class)
|
51
52
|
spiders[name] = spider_class
|
52
53
|
end
|
53
|
-
|
54
|
-
def before_context(url:)
|
55
|
-
downloader = Botz::Downloader.new(:html)
|
56
|
-
before_context_class = Class.new do
|
57
|
-
define_singleton_method(:call) do
|
58
|
-
downloader.call(url) do |page|
|
59
|
-
yield(page)
|
60
|
-
page.mech
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
64
|
-
const_set('before_context'.classify, before_context_class)
|
65
|
-
end
|
66
54
|
end
|
67
55
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
68
56
|
end
|
data/lib/botz/definition_file.rb
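CHANGED
data/lib/botz/resource_accessor.rb
ADDED
data/lib/botz/resource_accessor/direct_html.rb
ADDED
data/lib/botz/resource_accessor/html.rb
ADDED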
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Mechanize wrapper
|
5
|
+
#
|
6
|
+
class Botz::ResourceAccessor::Html
|
7
|
+
USER_AGENT = [
|
8
|
+
'Mozilla/5.0',
|
9
|
+
'(Macintosh; Intel Mac OS X 10_12_6)',
|
10
|
+
'AppleWebKit/537.36',
|
11
|
+
'(KHTML, like Gecko)',
|
12
|
+
'Chrome/64.0.3282.186',
|
13
|
+
'Safari/537.36'
|
14
|
+
].join(' ')
|
15
|
+
|
16
|
+
attr_reader :start_url
|
17
|
+
attr_reader :agent
|
18
|
+
|
19
|
+
def initialize(start_url: nil, encoding: nil)
|
20
|
+
@start_url = start_url
|
21
|
+
@agent = Mechanize.new
|
22
|
+
if encoding
|
23
|
+
@agent.default_encoding = encoding
|
24
|
+
@agent.force_default_encoding = true
|
25
|
+
end
|
26
|
+
@agent.user_agent = USER_AGENT
|
27
|
+
end
|
28
|
+
|
29
|
+
def call(url = @start_url, &block)
|
30
|
+
fail 'URL is undefined' if url.blank?
|
31
|
+
|
32
|
+
agent.get(url, &block)
|
33
|
+
end
|
34
|
+
end
|
data/lib/botz/scraper.rb
ADDED
data/lib/botz/scraper/direct_html.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# direct resource to html scraping
|
5
|
+
#
|
6
|
+
class Botz::Scraper::DirectHtml
|
7
|
+
include ActiveModel::Model
|
8
|
+
include ActiveModel::Attributes
|
9
|
+
|
10
|
+
#
|
11
|
+
# Scraper error class
|
12
|
+
#
|
13
|
+
class Error < StandardError
|
14
|
+
def initialize(scraper_class, errors)
|
15
|
+
super("#{scraper_class} # #{errors.full_messages}")
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
attr_reader :scraper_class
|
20
|
+
attr_reader :html
|
21
|
+
|
22
|
+
def initialize(scraper_class, resource)
|
23
|
+
@scraper_class = scraper_class
|
24
|
+
@html = resource
|
25
|
+
end
|
26
|
+
|
27
|
+
class << self
|
28
|
+
def field_names
|
29
|
+
@field_names ||= []
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def to_h
|
34
|
+
fetched_at = Time.current
|
35
|
+
fetched_on = fetched_at.beginning_of_day
|
36
|
+
timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
|
37
|
+
self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
|
38
|
+
end
|
39
|
+
|
40
|
+
def call
|
41
|
+
fail Error.new(scraper_class, errors) if invalid?
|
42
|
+
|
43
|
+
yield(to_h)
|
44
|
+
end
|
45
|
+
|
46
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
47
|
+
def self.field(name, path = nil, persist: true, &block)
|
48
|
+
if persist
|
49
|
+
field_names << name
|
50
|
+
case name
|
51
|
+
when /.*\?/
|
52
|
+
validates name, inclusion: { in: [true, false] }
|
53
|
+
else
|
54
|
+
validates name, presence: true, allow_blank: true
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
return define_method(name) { instance_exec(html, &block) } if path.nil?
|
59
|
+
return define_method(name) { html.search(path).text.strip } if block.nil?
|
60
|
+
|
61
|
+
define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
|
62
|
+
end
|
63
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
64
|
+
end
|
data/lib/botz/{html_scraper_macro.rb → scraper/html.rb}
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
# DSL for parsing html into objects
|
5
5
|
#
|
6
|
-
class Botz::HtmlScraperMacro
|
6
|
+
class Botz::Scraper::Html
|
7
7
|
include ActiveModel::Model
|
8
8
|
include ActiveModel::Attributes
|
9
9
|
|
@@ -24,7 +24,6 @@ class Botz::HtmlScraperMacro
|
|
24
24
|
@scraper_class = scraper_class
|
25
25
|
@url = resource.uri
|
26
26
|
@html = resource
|
27
|
-
@writer = writer
|
28
27
|
end
|
29
28
|
|
30
29
|
class << self
|
data/lib/botz/shell.rb
CHANGED
@@ -17,9 +17,9 @@ class Botz::Shell
|
|
17
17
|
while line = STDIN.gets
|
18
18
|
url = line.strip
|
19
19
|
begin
|
20
|
-
command.call(url, &definition_file.
|
21
|
-
rescue
|
22
|
-
STDERR.puts "ERROR #{
|
20
|
+
command.call(url, &definition_file.output)
|
21
|
+
rescue => e
|
22
|
+
STDERR.puts "ERROR #{e}"
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
@@ -47,4 +47,31 @@ class Botz::Shell
|
|
47
47
|
}
|
48
48
|
SHELL
|
49
49
|
end
|
50
|
+
|
51
|
+
# rubocop:disable Metrics/MethodLength
|
52
|
+
def build(name)
|
53
|
+
File.open("#{name}.rb", 'w') do |f|
|
54
|
+
f.write <<~RUBY
|
55
|
+
# frozen_string_literal: true
|
56
|
+
|
57
|
+
Botz.define(:#{name}) do
|
58
|
+
spider(:example, 'http://example.com') do |html, yielder|
|
59
|
+
# yielder.call(url or resource)
|
60
|
+
end
|
61
|
+
|
62
|
+
scraper(:example) do
|
63
|
+
end
|
64
|
+
end
|
65
|
+
RUBY
|
66
|
+
end
|
67
|
+
|
68
|
+
File.open("#{name}.sh", 'w') do |f|
|
69
|
+
f.write <<~SHELL
|
70
|
+
#!/bin/bash
|
71
|
+
eval "$(botz $(dirname "${0}")/#{name}.rb shell)"
|
72
|
+
spider example
|
73
|
+
SHELL
|
74
|
+
end
|
75
|
+
end
|
76
|
+
# rubocop:enable Metrics/MethodLength
|
50
77
|
end
|
data/lib/botz/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: botz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-07-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -151,8 +151,12 @@ files:
|
|
151
151
|
- lib/botz/console.rb
|
152
152
|
- lib/botz/definition.rb
|
153
153
|
- lib/botz/definition_file.rb
|
154
|
-
- lib/botz/downloader.rb
|
155
|
-
- lib/botz/html_scraper_macro.rb
|
154
|
+
- lib/botz/resource_accessor.rb
|
155
|
+
- lib/botz/resource_accessor/direct_html.rb
|
156
|
+
- lib/botz/resource_accessor/html.rb
|
157
|
+
- lib/botz/scraper.rb
|
158
|
+
- lib/botz/scraper/direct_html.rb
|
159
|
+
- lib/botz/scraper/html.rb
|
156
160
|
- lib/botz/shell.rb
|
157
161
|
- lib/botz/spider.rb
|
158
162
|
- lib/botz/version.rb
|
data/lib/botz/downloader.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
#
|
4
|
-
# This class is responsible for actually making a network connection and downloading hypertext
|
5
|
-
#
|
6
|
-
class Botz::Downloader
|
7
|
-
include ActiveModel::Model
|
8
|
-
include ActiveModel::Attributes
|
9
|
-
USER_AGENT = [
|
10
|
-
'Mozilla/5.0',
|
11
|
-
'(Macintosh; Intel Mac OS X 10_12_6)',
|
12
|
-
'AppleWebKit/537.36',
|
13
|
-
'(KHTML, like Gecko)',
|
14
|
-
'Chrome/64.0.3282.186',
|
15
|
-
'Safari/537.36'
|
16
|
-
].join(' ')
|
17
|
-
|
18
|
-
html_loader = lambda do |ctx, url, block|
|
19
|
-
block_result = nil
|
20
|
-
agent = Mechanize.new
|
21
|
-
agent.user_agent = USER_AGENT
|
22
|
-
ctx&.call(agent)
|
23
|
-
agent.get(url) { |page| block_result = block.call(page) }
|
24
|
-
block_result
|
25
|
-
end
|
26
|
-
|
27
|
-
json_loader = lambda do |_ctx, url, block|
|
28
|
-
block.call JSON.parse(OpenURI.open_uri(url, 'User-Agent' => USER_AGENT))
|
29
|
-
end
|
30
|
-
|
31
|
-
class_attribute :loaders, default: { html: html_loader, json: json_loader }
|
32
|
-
|
33
|
-
attribute :context
|
34
|
-
attribute :loader
|
35
|
-
|
36
|
-
def initialize(name, context = nil)
|
37
|
-
super(loader: loaders[name], context: context)
|
38
|
-
end
|
39
|
-
|
40
|
-
def call(url, &block)
|
41
|
-
loader.call(context, url, block)
|
42
|
-
end
|
43
|
-
end
|