botz 0.4.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/botz/binder.rb +70 -0
- data/lib/botz/{resource_accessor → connector}/direct_html.rb +3 -1
- data/lib/botz/{resource_accessor → connector}/html.rb +15 -1
- data/lib/botz/{resource_accessor.rb → connector.rb} +1 -1
- data/lib/botz/definition.rb +73 -26
- data/lib/botz/looper.rb +22 -0
- data/lib/botz/result.rb +23 -0
- data/lib/botz/version.rb +1 -1
- data/lib/botz.rb +4 -2
- metadata +8 -8
- data/lib/botz/scraper/direct_html.rb +0 -64
- data/lib/botz/scraper/html.rb +0 -70
- data/lib/botz/scraper.rb +0 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8dc85ac7df0f64087255c2aa722ba3e6151baca615ce00d9ce3c7cebd022f231
|
4
|
+
data.tar.gz: 7f77369c8e744b050b2b08273b35f1e95e299d8f8adad3e7760e41f920a10af9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d7f5be597513d01464b23d6053d037b014008735745cb267d9299581efd8f11b1c5109929490b80cfbfee850c88202a3f0d298341a192cfe1f96212c72d8eee
|
7
|
+
data.tar.gz: 2f885e634ed4fe24ebd642bd65cf7c7c059eb1ec31d7a4e842cec0663527ba1da8366f719ded80a9e43346de8aa98e6f23564cde7f8e01265684bd44fbf5d5b5
|
data/Gemfile.lock
CHANGED
data/lib/botz/binder.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Bind resource received from the connection to the result object
|
5
|
+
#
|
6
|
+
class Botz::Binder
|
7
|
+
class_attribute :field_names, default: []
|
8
|
+
attr_reader :resource
|
9
|
+
|
10
|
+
#
|
11
|
+
# binding multiple
|
12
|
+
#
|
13
|
+
class Multiple
|
14
|
+
def self.bind(connector:, binder:, query:, block:)
|
15
|
+
multiple_binding_class = self
|
16
|
+
connector.field(binder, query) do |elements|
|
17
|
+
multiple_binding_class.new(binder.class).instance_exec(elements, &block)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def initialize(binder)
|
22
|
+
@binder = binder
|
23
|
+
end
|
24
|
+
|
25
|
+
def field(name)
|
26
|
+
@binder.field_names << name
|
27
|
+
@binder.field_names.uniq!
|
28
|
+
@binder.result_class.define(name)
|
29
|
+
result = yield
|
30
|
+
@binder.define_method(name) { result }
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def initialize(resource)
|
35
|
+
@resource = resource
|
36
|
+
self.class.fields_call(self)
|
37
|
+
end
|
38
|
+
|
39
|
+
def result
|
40
|
+
new_result(field_names.map { |field_name| [field_name, send(field_name)] }.to_h)
|
41
|
+
end
|
42
|
+
|
43
|
+
def new_result(values)
|
44
|
+
fetched_at = Time.current
|
45
|
+
self.class.result_class.new(fetched_at: fetched_at, fetched_on: fetched_at.beginning_of_day, **values)
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.query(name, query = nil, &block)
|
49
|
+
define_method(name) do
|
50
|
+
connector.field(self, query, &block)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def self.field(name, query = nil, &block)
|
55
|
+
field_names << name
|
56
|
+
field_names.uniq!
|
57
|
+
result_class.define(name)
|
58
|
+
define_method(name) do
|
59
|
+
connector.field(self, query, &block)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.fields(query, &block)
|
64
|
+
@fields = { query: query, block: block }
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.fields_call(binder)
|
68
|
+
Multiple.bind(connector: connector, binder: binder, query: @fields[:query], block: @fields[:block]) if @fields
|
69
|
+
end
|
70
|
+
end
|
data/lib/botz/{resource_accessor → connector}/html.rb
CHANGED
@@ -3,7 +3,21 @@
|
|
3
3
|
#
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
|
-
class Botz::ResourceAccessor::Html
|
6
|
+
class Botz::Connector::Html
|
7
|
+
#
|
8
|
+
# field macro
|
9
|
+
#
|
10
|
+
module Field
|
11
|
+
def field(object, query, &block)
|
12
|
+
node = object.resource.search(query)
|
13
|
+
fail "Could not be located #{query}" if node.nil?
|
14
|
+
return node.first.text if block.nil?
|
15
|
+
|
16
|
+
object.instance_exec(node, &block)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
include Field
|
20
|
+
|
7
21
|
USER_AGENT = [
|
8
22
|
'Mozilla/5.0',
|
9
23
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
data/lib/botz/definition.rb
CHANGED
@@ -7,7 +7,7 @@ class Botz::Definition
|
|
7
7
|
class_attribute :domain
|
8
8
|
class_attribute :spiders, default: {}
|
9
9
|
class_attribute :scrapers, default: {}
|
10
|
-
class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
|
10
|
+
class_attribute :output, default: ->(result) { STDOUT.puts(result.attributes.to_json) }
|
11
11
|
|
12
12
|
def output(&block)
|
13
13
|
self.output = block
|
@@ -15,35 +15,14 @@ class Botz::Definition
|
|
15
15
|
|
16
16
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
17
17
|
class << self
|
18
|
-
def scraper(name, encoding: nil, as: :html, &block)
|
19
|
-
class_name = "#{name}_scraper".classify
|
20
|
-
accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
|
21
|
-
accessor = accessor_class.new(encoding: encoding)
|
22
|
-
binder_base = Botz::Scraper.const_get(as.to_s.classify)
|
23
|
-
binder = Class.new(binder_base, &block)
|
24
|
-
binder.define_singleton_method(:name) { class_name }
|
25
|
-
crawler_class = self
|
26
|
-
scraper_class = Class.new do
|
27
|
-
define_singleton_method(:crawler_class) { crawler_class }
|
28
|
-
define_singleton_method(:bind) do |url|
|
29
|
-
accessor.call(url) do |resource|
|
30
|
-
binder.new(scraper_class, resource)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
define_singleton_method(:call) { |url, &output| bind(url).call(&output) }
|
34
|
-
end
|
35
|
-
const_set(class_name, scraper_class)
|
36
|
-
scrapers[name] = scraper_class
|
37
|
-
end
|
38
|
-
|
39
18
|
def spider(name, start_url = nil, encoding: nil, as: :html, &block)
|
40
|
-
|
41
|
-
|
19
|
+
connector_class = Botz::Connector.const_get(as.to_s.classify)
|
20
|
+
connector = connector_class.new(start_url: start_url, encoding: encoding)
|
42
21
|
spider = Botz::Spider.new(&block)
|
43
22
|
spider_class = Class.new do
|
44
|
-
define_singleton_method(:
|
23
|
+
define_singleton_method(:connector) { connector }
|
45
24
|
define_singleton_method(:call) do |url = start_url, &spider_block|
|
46
|
-
|
25
|
+
connector.call(url) do |resource|
|
47
26
|
spider.call(resource, &spider_block)
|
48
27
|
end
|
49
28
|
end
|
@@ -51,6 +30,74 @@ class Botz::Definition
|
|
51
30
|
const_set("#{name}_spider".classify, spider_class)
|
52
31
|
spiders[name] = spider_class
|
53
32
|
end
|
33
|
+
|
34
|
+
def scraper(name, options, &block)
|
35
|
+
if options[:loop]
|
36
|
+
loop_scraper(name, options, &block)
|
37
|
+
else
|
38
|
+
normal_scraper(name, **options, &block)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def loop_scraper(name, options, &block)
|
45
|
+
options = { as: :html, start_url: nil, encoding: nil, loop: nil }.merge(options)
|
46
|
+
result_class = Class.new(Botz::Result)
|
47
|
+
|
48
|
+
# connector
|
49
|
+
connector_class = Botz::Connector.const_get(options[:as].to_s.classify)
|
50
|
+
connector = connector_class.new(encoding: options[:encoding])
|
51
|
+
|
52
|
+
namespace = Class.new do
|
53
|
+
binder = Class.new(Botz::Binder) do
|
54
|
+
define_singleton_method(:connector) { connector }
|
55
|
+
define_singleton_method(:result_class) { result_class }
|
56
|
+
define_method(:connector) { connector }
|
57
|
+
instance_exec(&block)
|
58
|
+
end
|
59
|
+
define_singleton_method(:call) do |url = options[:start_url], &yielder|
|
60
|
+
connector.call(url) do |resource|
|
61
|
+
looper = Botz::Looper.new(resource, binder, options[:loop])
|
62
|
+
looper.call(&yielder)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
const_set("#{name}_scraper".classify, namespace)
|
67
|
+
scrapers[name] = namespace
|
68
|
+
end
|
69
|
+
|
70
|
+
def normal_scraper(name, encoding: nil, as: :html, &block)
|
71
|
+
# result
|
72
|
+
result_class = Class.new(Botz::Result)
|
73
|
+
|
74
|
+
# connector
|
75
|
+
connector_class = Botz::Connector.const_get(as.to_s.classify)
|
76
|
+
connector = connector_class.new(encoding: encoding)
|
77
|
+
|
78
|
+
# namespace
|
79
|
+
namespace = Class.new do
|
80
|
+
binder = Class.new(Botz::Binder) do
|
81
|
+
define_singleton_method(:connector) { connector }
|
82
|
+
define_singleton_method(:result_class) { result_class }
|
83
|
+
define_method(:connector) { connector }
|
84
|
+
instance_exec(&block)
|
85
|
+
end
|
86
|
+
define_singleton_method(:bind) do |url|
|
87
|
+
connector.call(url) do |resource|
|
88
|
+
binder.new(resource)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
define_singleton_method(:call) do |url, &output|
|
92
|
+
result = bind(url).result
|
93
|
+
fail "#{url}\n#{result.errors.full_messages}" if result.invalid?
|
94
|
+
|
95
|
+
output.call(result)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
const_set("#{name}_scraper".classify, namespace)
|
99
|
+
scrapers[name] = namespace
|
100
|
+
end
|
54
101
|
end
|
55
102
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
56
103
|
end
|
data/lib/botz/looper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# looper
|
5
|
+
#
|
6
|
+
class Botz::Looper
|
7
|
+
def initialize(resource, binder, loop_block)
|
8
|
+
@resource = resource
|
9
|
+
@binder = binder
|
10
|
+
@loop_block = loop_block
|
11
|
+
end
|
12
|
+
|
13
|
+
def call
|
14
|
+
yielder = lambda do |element|
|
15
|
+
result = @binder.new(element).result
|
16
|
+
fail "#{element}\n\n#{result.errors.full_messages}" if result.invalid?
|
17
|
+
|
18
|
+
yield result
|
19
|
+
end
|
20
|
+
@loop_block.call(@resource, yielder)
|
21
|
+
end
|
22
|
+
end
|
data/lib/botz/result.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Scrape results
|
5
|
+
#
|
6
|
+
class Botz::Result
|
7
|
+
include ActiveModel::Model
|
8
|
+
include ActiveModel::Attributes
|
9
|
+
|
10
|
+
def self.define(name)
|
11
|
+
case name
|
12
|
+
when /.*\?/
|
13
|
+
attribute name, :boolean
|
14
|
+
validates name, inclusion: { in: [true, false] }
|
15
|
+
else
|
16
|
+
attribute name
|
17
|
+
validates name, presence: true, allow_blank: true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
attribute :fetched_at
|
22
|
+
attribute :fetched_on
|
23
|
+
end
|
data/lib/botz/version.rb
CHANGED
data/lib/botz.rb
CHANGED
@@ -14,9 +14,11 @@ module Botz
|
|
14
14
|
autoload :Console
|
15
15
|
autoload :Definition
|
16
16
|
autoload :DefinitionFile
|
17
|
+
autoload :Binder
|
17
18
|
autoload :Spider
|
18
|
-
autoload :Scraper
|
19
|
-
autoload :ResourceAccessor
|
19
|
+
autoload :Looper
|
20
|
+
autoload :Connector
|
21
|
+
autoload :Result
|
20
22
|
|
21
23
|
const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
|
22
24
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: botz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -148,15 +148,15 @@ files:
|
|
148
148
|
- botz.gemspec
|
149
149
|
- exe/botz
|
150
150
|
- lib/botz.rb
|
151
|
+
- lib/botz/binder.rb
|
152
|
+
- lib/botz/connector.rb
|
153
|
+
- lib/botz/connector/direct_html.rb
|
154
|
+
- lib/botz/connector/html.rb
|
151
155
|
- lib/botz/console.rb
|
152
156
|
- lib/botz/definition.rb
|
153
157
|
- lib/botz/definition_file.rb
|
154
|
-
- lib/botz/resource_accessor.rb
|
155
|
-
- lib/botz/resource_accessor/direct_html.rb
|
156
|
-
- lib/botz/resource_accessor/html.rb
|
157
|
-
- lib/botz/scraper.rb
|
158
|
-
- lib/botz/scraper/direct_html.rb
|
159
|
-
- lib/botz/scraper/html.rb
|
158
|
+
- lib/botz/looper.rb
|
159
|
+
- lib/botz/result.rb
|
160
160
|
- lib/botz/shell.rb
|
161
161
|
- lib/botz/spider.rb
|
162
162
|
- lib/botz/version.rb
|
data/lib/botz/scraper/direct_html.rb
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
#
|
4
|
-
# direct resource to html scraping
|
5
|
-
#
|
6
|
-
class Botz::Scraper::DirectHtml
|
7
|
-
include ActiveModel::Model
|
8
|
-
include ActiveModel::Attributes
|
9
|
-
|
10
|
-
#
|
11
|
-
# Scraper error class
|
12
|
-
#
|
13
|
-
class Error < StandardError
|
14
|
-
def initialize(scraper_class, errors)
|
15
|
-
super("#{scraper_class} # #{errors.full_messages}")
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
attr_reader :scraper_class
|
20
|
-
attr_reader :html
|
21
|
-
|
22
|
-
def initialize(scraper_class, resource)
|
23
|
-
@scraper_class = scraper_class
|
24
|
-
@html = resource
|
25
|
-
end
|
26
|
-
|
27
|
-
class << self
|
28
|
-
def field_names
|
29
|
-
@field_names ||= []
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def to_h
|
34
|
-
fetched_at = Time.current
|
35
|
-
fetched_on = fetched_at.beginning_of_day
|
36
|
-
timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
|
37
|
-
self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
|
38
|
-
end
|
39
|
-
|
40
|
-
def call
|
41
|
-
fail Error.new(scraper_class, errors) if invalid?
|
42
|
-
|
43
|
-
yield(to_h)
|
44
|
-
end
|
45
|
-
|
46
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
47
|
-
def self.field(name, path = nil, persist: true, &block)
|
48
|
-
if persist
|
49
|
-
field_names << name
|
50
|
-
case name
|
51
|
-
when /.*\?/
|
52
|
-
validates name, inclusion: { in: [true, false] }
|
53
|
-
else
|
54
|
-
validates name, presence: true, allow_blank: true
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
return define_method(name) { instance_exec(html, &block) } if path.nil?
|
59
|
-
return define_method(name) { html.search(path).text.strip } if block.nil?
|
60
|
-
|
61
|
-
define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
|
62
|
-
end
|
63
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
64
|
-
end
|
data/lib/botz/scraper/html.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
#
|
4
|
-
# DSL for parsing html into objects
|
5
|
-
#
|
6
|
-
class Botz::Scraper::Html
|
7
|
-
include ActiveModel::Model
|
8
|
-
include ActiveModel::Attributes
|
9
|
-
|
10
|
-
#
|
11
|
-
# Scraper error class
|
12
|
-
#
|
13
|
-
class Error < StandardError
|
14
|
-
def initialize(scraper_class, url, errors)
|
15
|
-
super("#{scraper_class}(#{url}) # #{errors.full_messages}")
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
attr_reader :scraper_class
|
20
|
-
attr_reader :url
|
21
|
-
attr_reader :html
|
22
|
-
|
23
|
-
def initialize(scraper_class, resource)
|
24
|
-
@scraper_class = scraper_class
|
25
|
-
@url = resource.uri
|
26
|
-
@html = resource
|
27
|
-
end
|
28
|
-
|
29
|
-
class << self
|
30
|
-
def field_names
|
31
|
-
@field_names ||= []
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def primary_key
|
36
|
-
url.to_s
|
37
|
-
end
|
38
|
-
|
39
|
-
def to_h
|
40
|
-
fetched_at = Time.current
|
41
|
-
fetched_on = fetched_at.beginning_of_day
|
42
|
-
timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
|
43
|
-
self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
|
44
|
-
end
|
45
|
-
|
46
|
-
def call
|
47
|
-
fail Error.new(scraper_class, url, errors) if invalid?
|
48
|
-
|
49
|
-
yield(to_h)
|
50
|
-
end
|
51
|
-
|
52
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
53
|
-
def self.field(name, path = nil, persist: true, &block)
|
54
|
-
if persist
|
55
|
-
field_names << name
|
56
|
-
case name
|
57
|
-
when /.*\?/
|
58
|
-
validates name, inclusion: { in: [true, false] }
|
59
|
-
else
|
60
|
-
validates name, presence: true, allow_blank: true
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
return define_method(name) { instance_exec(html, &block) } if path.nil?
|
65
|
-
return define_method(name) { html.search(path).text.strip } if block.nil?
|
66
|
-
|
67
|
-
define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
|
68
|
-
end
|
69
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
70
|
-
end
|