botz 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/botz/binder.rb +70 -0
- data/lib/botz/{resource_accessor → connector}/direct_html.rb +3 -1
- data/lib/botz/{resource_accessor → connector}/html.rb +15 -1
- data/lib/botz/{resource_accessor.rb → connector.rb} +1 -1
- data/lib/botz/definition.rb +73 -26
- data/lib/botz/looper.rb +22 -0
- data/lib/botz/result.rb +23 -0
- data/lib/botz/version.rb +1 -1
- data/lib/botz.rb +4 -2
- metadata +8 -8
- data/lib/botz/scraper/direct_html.rb +0 -64
- data/lib/botz/scraper/html.rb +0 -70
- data/lib/botz/scraper.rb +0 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8dc85ac7df0f64087255c2aa722ba3e6151baca615ce00d9ce3c7cebd022f231
|
4
|
+
data.tar.gz: 7f77369c8e744b050b2b08273b35f1e95e299d8f8adad3e7760e41f920a10af9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d7f5be597513d01464b23d6053d037b014008735745cb267d9299581efd8f11b1c5109929490b80cfbfee850c88202a3f0d298341a192cfe1f96212c72d8eee
|
7
|
+
data.tar.gz: 2f885e634ed4fe24ebd642bd65cf7c7c059eb1ec31d7a4e842cec0663527ba1da8366f719ded80a9e43346de8aa98e6f23564cde7f8e01265684bd44fbf5d5b5
|
data/Gemfile.lock
CHANGED
data/lib/botz/binder.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Bind resource received from the connection to the result object
|
5
|
+
#
|
6
|
+
class Botz::Binder
  # Names of the fields that #result copies into the result object.
  # This is a class_attribute, so the default array is shared by every
  # subclass until a subclass assigns its own value. It must therefore be
  # replaced through the writer and never mutated in place (no `<<`),
  # otherwise every binder subclass ends up sharing one field list.
  class_attribute :field_names, default: []
  attr_reader :resource

  #
  # Binds several fields out of a single query.
  #
  # The block registered with Binder.fields is evaluated on an instance of
  # this class, so calls to `field` inside that block land on Multiple#field.
  #
  class Multiple
    # Runs `query` through the connector and evaluates `block` on a fresh
    # Multiple instance, passing it the located elements.
    #
    # `self` is captured in a local because the block handed to the
    # connector may be evaluated with a different receiver.
    def self.bind(connector:, binder:, query:, block:)
      multiple_binding_class = self
      connector.field(binder, query) do |elements|
        multiple_binding_class.new(binder.class).instance_exec(elements, &block)
      end
    end

    # @param binder the binder *class* whose fields are being declared
    def initialize(binder)
      @binder = binder
    end

    # Registers `name` as a result field and defines a reader on the binder
    # class that returns the value computed by the given block.
    def field(name)
      # Reassign rather than append so that only this binder class is
      # affected (see the note on field_names above).
      @binder.field_names |= [name]
      @binder.result_class.define(name)
      result = yield
      # NOTE(review): the reader is defined on the class, not the instance,
      # so the most recently bound value is visible to all instances.
      @binder.define_method(name) { result }
    end
  end

  # Stores the resource and immediately runs the multi-field binding
  # (if one was registered through Binder.fields).
  def initialize(resource)
    @resource = resource
    self.class.fields_call(self)
  end

  # Builds the result object from the value of every registered field.
  def result
    new_result(field_names.map { |field_name| [field_name, send(field_name)] }.to_h)
  end

  # Instantiates the result class with the field values plus the
  # fetched_at / fetched_on timestamps.
  def new_result(values)
    fetched_at = Time.current
    self.class.result_class.new(fetched_at: fetched_at, fetched_on: fetched_at.beginning_of_day, **values)
  end

  # Defines a reader `name` that resolves `query` through the connector.
  # The value is NOT included in the result (see Binder.field for that).
  # `connector` is expected to be provided by the concrete binder class.
  def self.query(name, query = nil, &block)
    define_method(name) do
      connector.field(self, query, &block)
    end
  end

  # Same as Binder.query, but additionally registers `name` as a result
  # field and declares the matching attribute on the result class.
  def self.field(name, query = nil, &block)
    # Reassign through the writer: appending to the inherited array would
    # leak this field into every other binder class.
    self.field_names |= [name]
    result_class.define(name)
    define_method(name) do
      connector.field(self, query, &block)
    end
  end

  # Registers a query and a block that declares several fields at once.
  # The block is run for every new binder instance (see .fields_call).
  def self.fields(query, &block)
    @fields = { query: query, block: block }
  end

  # Executes the binding registered with .fields for the given binder
  # instance; does nothing when no such binding was registered.
  def self.fields_call(binder)
    Multiple.bind(connector: connector, binder: binder, query: @fields[:query], block: @fields[:block]) if @fields
  end
end
|
data/lib/botz/{resource_accessor → connector}/html.rb
CHANGED
@@ -3,7 +3,21 @@
|
|
3
3
|
#
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
|
-
class Botz::ResourceAccessor::Html
|
6
|
+
class Botz::Connector::Html
|
7
|
+
#
|
8
|
+
# field macro
|
9
|
+
#
|
10
|
+
module Field
  # Looks up `query` on `object.resource` and returns the field value.
  #
  # Without a block the text of the first match is returned. With a block,
  # the block is evaluated in the context of `object` and receives the whole
  # search result (which may be empty), and its value is returned.
  #
  # Raises RuntimeError when the search yields nil, or when no block is given
  # and nothing matched. The search result is presumably a Nokogiri NodeSet,
  # which is empty rather than nil when nothing matches, so the nil check
  # alone would let `first.text` fail with NoMethodError.
  def field(object, query, &block)
    node = object.resource.search(query)
    fail "Could not be located #{query}" if node.nil?
    return object.instance_exec(node, &block) if block

    first = node.first
    fail "Could not be located #{query}" if first.nil?

    first.text
  end
end
|
19
|
+
include Field
|
20
|
+
|
7
21
|
USER_AGENT = [
|
8
22
|
'Mozilla/5.0',
|
9
23
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
data/lib/botz/definition.rb
CHANGED
@@ -7,7 +7,7 @@ class Botz::Definition
|
|
7
7
|
class_attribute :domain
|
8
8
|
class_attribute :spiders, default: {}
|
9
9
|
class_attribute :scrapers, default: {}
|
10
|
-
class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
|
10
|
+
class_attribute :output, default: ->(result) { STDOUT.puts(result.attributes.to_json) }
|
11
11
|
|
12
12
|
def output(&block)
|
13
13
|
self.output = block
|
@@ -15,35 +15,14 @@ class Botz::Definition
|
|
15
15
|
|
16
16
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
17
17
|
class << self
|
18
|
-
def scraper(name, encoding: nil, as: :html, &block)
|
19
|
-
class_name = "#{name}_scraper".classify
|
20
|
-
accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
|
21
|
-
accessor = accessor_class.new(encoding: encoding)
|
22
|
-
binder_base = Botz::Scraper.const_get(as.to_s.classify)
|
23
|
-
binder = Class.new(binder_base, &block)
|
24
|
-
binder.define_singleton_method(:name) { class_name }
|
25
|
-
crawler_class = self
|
26
|
-
scraper_class = Class.new do
|
27
|
-
define_singleton_method(:crawler_class) { crawler_class }
|
28
|
-
define_singleton_method(:bind) do |url|
|
29
|
-
accessor.call(url) do |resource|
|
30
|
-
binder.new(scraper_class, resource)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
define_singleton_method(:call) { |url, &output| bind(url).call(&output) }
|
34
|
-
end
|
35
|
-
const_set(class_name, scraper_class)
|
36
|
-
scrapers[name] = scraper_class
|
37
|
-
end
|
38
|
-
|
39
18
|
def spider(name, start_url = nil, encoding: nil, as: :html, &block)
|
40
|
-
|
41
|
-
|
19
|
+
connector_class = Botz::Connector.const_get(as.to_s.classify)
|
20
|
+
connector = connector_class.new(start_url: start_url, encoding: encoding)
|
42
21
|
spider = Botz::Spider.new(&block)
|
43
22
|
spider_class = Class.new do
|
44
|
-
define_singleton_method(:
|
23
|
+
define_singleton_method(:connector) { connector }
|
45
24
|
define_singleton_method(:call) do |url = start_url, &spider_block|
|
46
|
-
|
25
|
+
connector.call(url) do |resource|
|
47
26
|
spider.call(resource, &spider_block)
|
48
27
|
end
|
49
28
|
end
|
@@ -51,6 +30,74 @@ class Botz::Definition
|
|
51
30
|
const_set("#{name}_spider".classify, spider_class)
|
52
31
|
spiders[name] = spider_class
|
53
32
|
end
|
33
|
+
|
34
|
+
# Defines a scraper called `name`.
#
# `options` may contain :loop (a callable that drives iteration over the
# fetched resource), in which case a looping scraper is built; otherwise
# the options are forwarded as keywords to the single-result scraper.
#
# `options` defaults to an empty hash so that `scraper :name do ... end`
# keeps working without any options.
def scraper(name, options = {}, &block)
  if options[:loop]
    loop_scraper(name, options, &block)
  else
    normal_scraper(name, **options, &block)
  end
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
# Builds a scraper that emits one result per element produced by the
# :loop callable, registers it as a constant named after `name` and
# stores it in `scrapers`.
#
# Recognised options (missing keys fall back to the defaults below):
# :as, :start_url, :encoding, :loop.
def loop_scraper(name, options, &block)
  settings = { as: :html, start_url: nil, encoding: nil, loop: nil }.merge(options)
  result_class = Class.new(Botz::Result)

  # connector
  connector = Botz::Connector.const_get(settings[:as].to_s.classify).new(encoding: settings[:encoding])

  # binder: the user's block is evaluated on the binder class itself
  binder = Class.new(Botz::Binder) do
    define_singleton_method(:connector) { connector }
    define_singleton_method(:result_class) { result_class }
    define_method(:connector) { connector }
    instance_exec(&block)
  end

  # namespace: fetches the url and hands the resource to a looper
  namespace = Class.new
  namespace.define_singleton_method(:call) do |url = settings[:start_url], &yielder|
    connector.call(url) do |resource|
      Botz::Looper.new(resource, binder, settings[:loop]).call(&yielder)
    end
  end

  const_set("#{name}_scraper".classify, namespace)
  scrapers[name] = namespace
end
|
69
|
+
|
70
|
+
# Builds a scraper that produces a single result per url, registers it as
# a constant named after `name` and stores it in `scrapers`.
#
# The generated class responds to:
#   bind(url) - fetches the url and wraps the resource in a binder
#   call(url) - binds, fails if the result is invalid, then passes the
#               result to the given block
def normal_scraper(name, encoding: nil, as: :html, &block)
  # result
  result_class = Class.new(Botz::Result)

  # connector
  connector = Botz::Connector.const_get(as.to_s.classify).new(encoding: encoding)

  # binder: the user's block is evaluated on the binder class itself
  binder = Class.new(Botz::Binder) do
    define_singleton_method(:connector) { connector }
    define_singleton_method(:result_class) { result_class }
    define_method(:connector) { connector }
    instance_exec(&block)
  end

  # namespace
  namespace = Class.new
  namespace.define_singleton_method(:bind) do |url|
    connector.call(url) { |resource| binder.new(resource) }
  end
  namespace.define_singleton_method(:call) do |url, &output|
    outcome = bind(url).result
    fail "#{url}\n#{outcome.errors.full_messages}" if outcome.invalid?

    output.call(outcome)
  end

  const_set("#{name}_scraper".classify, namespace)
  scrapers[name] = namespace
end
|
54
101
|
end
|
55
102
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
56
103
|
end
|
data/lib/botz/looper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# looper
|
5
|
+
#
|
6
|
+
class Botz::Looper
  # @param resource   the fetched resource handed to the loop block
  # @param binder     binder class instantiated once per yielded element
  # @param loop_block callable receiving (resource, yielder)
  def initialize(resource, binder, loop_block)
    @resource = resource
    @binder = binder
    @loop_block = loop_block
  end

  # Runs the loop block. Every element the loop block passes to the yielder
  # is bound, validated and then yielded to the caller's block.
  def call
    emit = ->(element) { yield checked_result(element) }
    @loop_block.call(@resource, emit)
  end

  private

  # Binds one element and returns its result; fails when it is invalid.
  def checked_result(element)
    outcome = @binder.new(element).result
    return outcome unless outcome.invalid?

    fail "#{element}\n\n#{outcome.errors.full_messages}"
  end
end
|
data/lib/botz/result.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Scrape results
|
5
|
+
#
|
6
|
+
class Botz::Result
  include ActiveModel::Model
  include ActiveModel::Attributes

  # Timestamps carried by every result.
  attribute :fetched_at
  attribute :fetched_on

  # Declares the attribute `name` together with its validation.
  # Names containing "?" become boolean attributes restricted to
  # true/false; all other names get an untyped attribute.
  def self.define(name)
    if /.*\?/.match?(name)
      attribute name, :boolean
      validates name, inclusion: { in: [true, false] }
    else
      attribute name
      validates name, presence: true, allow_blank: true
    end
  end
end
|
data/lib/botz/version.rb
CHANGED
data/lib/botz.rb
CHANGED
@@ -14,9 +14,11 @@ module Botz
|
|
14
14
|
autoload :Console
|
15
15
|
autoload :Definition
|
16
16
|
autoload :DefinitionFile
|
17
|
+
autoload :Binder
|
17
18
|
autoload :Spider
|
18
|
-
autoload :
|
19
|
-
autoload :
|
19
|
+
autoload :Looper
|
20
|
+
autoload :Connector
|
21
|
+
autoload :Result
|
20
22
|
|
21
23
|
const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
|
22
24
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: botz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -148,15 +148,15 @@ files:
|
|
148
148
|
- botz.gemspec
|
149
149
|
- exe/botz
|
150
150
|
- lib/botz.rb
|
151
|
+
- lib/botz/binder.rb
|
152
|
+
- lib/botz/connector.rb
|
153
|
+
- lib/botz/connector/direct_html.rb
|
154
|
+
- lib/botz/connector/html.rb
|
151
155
|
- lib/botz/console.rb
|
152
156
|
- lib/botz/definition.rb
|
153
157
|
- lib/botz/definition_file.rb
|
154
|
-
- lib/botz/resource_accessor.rb
|
155
|
-
- lib/botz/resource_accessor/direct_html.rb
|
156
|
-
- lib/botz/resource_accessor/html.rb
|
157
|
-
- lib/botz/scraper.rb
|
158
|
-
- lib/botz/scraper/direct_html.rb
|
159
|
-
- lib/botz/scraper/html.rb
|
158
|
+
- lib/botz/looper.rb
|
159
|
+
- lib/botz/result.rb
|
160
160
|
- lib/botz/shell.rb
|
161
161
|
- lib/botz/spider.rb
|
162
162
|
- lib/botz/version.rb
|
data/lib/botz/scraper/direct_html.rb
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
#
|
4
|
-
# direct resource to html scraping
|
5
|
-
#
|
6
|
-
class Botz::Scraper::DirectHtml
|
7
|
-
include ActiveModel::Model
|
8
|
-
include ActiveModel::Attributes
|
9
|
-
|
10
|
-
#
|
11
|
-
# Scraper error class
|
12
|
-
#
|
13
|
-
class Error < StandardError
|
14
|
-
def initialize(scraper_class, errors)
|
15
|
-
super("#{scraper_class} # #{errors.full_messages}")
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
attr_reader :scraper_class
|
20
|
-
attr_reader :html
|
21
|
-
|
22
|
-
def initialize(scraper_class, resource)
|
23
|
-
@scraper_class = scraper_class
|
24
|
-
@html = resource
|
25
|
-
end
|
26
|
-
|
27
|
-
class << self
|
28
|
-
def field_names
|
29
|
-
@field_names ||= []
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def to_h
|
34
|
-
fetched_at = Time.current
|
35
|
-
fetched_on = fetched_at.beginning_of_day
|
36
|
-
timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
|
37
|
-
self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
|
38
|
-
end
|
39
|
-
|
40
|
-
def call
|
41
|
-
fail Error.new(scraper_class, errors) if invalid?
|
42
|
-
|
43
|
-
yield(to_h)
|
44
|
-
end
|
45
|
-
|
46
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
47
|
-
def self.field(name, path = nil, persist: true, &block)
|
48
|
-
if persist
|
49
|
-
field_names << name
|
50
|
-
case name
|
51
|
-
when /.*\?/
|
52
|
-
validates name, inclusion: { in: [true, false] }
|
53
|
-
else
|
54
|
-
validates name, presence: true, allow_blank: true
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
return define_method(name) { instance_exec(html, &block) } if path.nil?
|
59
|
-
return define_method(name) { html.search(path).text.strip } if block.nil?
|
60
|
-
|
61
|
-
define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
|
62
|
-
end
|
63
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
64
|
-
end
|
data/lib/botz/scraper/html.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
#
|
4
|
-
# DSL for parsing html into objects
|
5
|
-
#
|
6
|
-
class Botz::Scraper::Html
|
7
|
-
include ActiveModel::Model
|
8
|
-
include ActiveModel::Attributes
|
9
|
-
|
10
|
-
#
|
11
|
-
# Scraper error class
|
12
|
-
#
|
13
|
-
class Error < StandardError
|
14
|
-
def initialize(scraper_class, url, errors)
|
15
|
-
super("#{scraper_class}(#{url}) # #{errors.full_messages}")
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
attr_reader :scraper_class
|
20
|
-
attr_reader :url
|
21
|
-
attr_reader :html
|
22
|
-
|
23
|
-
def initialize(scraper_class, resource)
|
24
|
-
@scraper_class = scraper_class
|
25
|
-
@url = resource.uri
|
26
|
-
@html = resource
|
27
|
-
end
|
28
|
-
|
29
|
-
class << self
|
30
|
-
def field_names
|
31
|
-
@field_names ||= []
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def primary_key
|
36
|
-
url.to_s
|
37
|
-
end
|
38
|
-
|
39
|
-
def to_h
|
40
|
-
fetched_at = Time.current
|
41
|
-
fetched_on = fetched_at.beginning_of_day
|
42
|
-
timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
|
43
|
-
self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
|
44
|
-
end
|
45
|
-
|
46
|
-
def call
|
47
|
-
fail Error.new(scraper_class, url, errors) if invalid?
|
48
|
-
|
49
|
-
yield(to_h)
|
50
|
-
end
|
51
|
-
|
52
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
53
|
-
def self.field(name, path = nil, persist: true, &block)
|
54
|
-
if persist
|
55
|
-
field_names << name
|
56
|
-
case name
|
57
|
-
when /.*\?/
|
58
|
-
validates name, inclusion: { in: [true, false] }
|
59
|
-
else
|
60
|
-
validates name, presence: true, allow_blank: true
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
return define_method(name) { instance_exec(html, &block) } if path.nil?
|
65
|
-
return define_method(name) { html.search(path).text.strip } if block.nil?
|
66
|
-
|
67
|
-
define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
|
68
|
-
end
|
69
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
70
|
-
end
|