botz 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
4
- data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
3
+ metadata.gz: 8dc85ac7df0f64087255c2aa722ba3e6151baca615ce00d9ce3c7cebd022f231
4
+ data.tar.gz: 7f77369c8e744b050b2b08273b35f1e95e299d8f8adad3e7760e41f920a10af9
5
5
  SHA512:
6
- metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
7
- data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095
6
+ metadata.gz: 4d7f5be597513d01464b23d6053d037b014008735745cb267d9299581efd8f11b1c5109929490b80cfbfee850c88202a3f0d298341a192cfe1f96212c72d8eee
7
+ data.tar.gz: 2f885e634ed4fe24ebd642bd65cf7c7c059eb1ec31d7a4e842cec0663527ba1da8366f719ded80a9e43346de8aa98e6f23564cde7f8e01265684bd44fbf5d5b5
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- botz (0.4.0)
4
+ botz (0.5.0)
5
5
  activemodel (~> 5.2)
6
6
  activesupport (~> 5.2)
7
7
  mechanize
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Bind resource received from the connection to the result object
5
+ #
6
+ class Botz::Binder
7
+ class_attribute :field_names, default: []
8
+ attr_reader :resource
9
+
10
+ #
11
+ # binding multiple
12
+ #
13
+ class Multiple
14
+ def self.bind(connector:, binder:, query:, block:)
15
+ multiple_binding_class = self
16
+ connector.field(binder, query) do |elements|
17
+ multiple_binding_class.new(binder.class).instance_exec(elements, &block)
18
+ end
19
+ end
20
+
21
+ def initialize(binder)
22
+ @binder = binder
23
+ end
24
+
25
+ def field(name)
26
+ @binder.field_names << name
27
+ @binder.field_names.uniq!
28
+ @binder.result_class.define(name)
29
+ result = yield
30
+ @binder.define_method(name) { result }
31
+ end
32
+ end
33
+
34
+ def initialize(resource)
35
+ @resource = resource
36
+ self.class.fields_call(self)
37
+ end
38
+
39
+ def result
40
+ new_result(field_names.map { |field_name| [field_name, send(field_name)] }.to_h)
41
+ end
42
+
43
+ def new_result(values)
44
+ fetched_at = Time.current
45
+ self.class.result_class.new(fetched_at: fetched_at, fetched_on: fetched_at.beginning_of_day, **values)
46
+ end
47
+
48
+ def self.query(name, query = nil, &block)
49
+ define_method(name) do
50
+ connector.field(self, query, &block)
51
+ end
52
+ end
53
+
54
+ def self.field(name, query = nil, &block)
55
+ field_names << name
56
+ field_names.uniq!
57
+ result_class.define(name)
58
+ define_method(name) do
59
+ connector.field(self, query, &block)
60
+ end
61
+ end
62
+
63
+ def self.fields(query, &block)
64
+ @fields = { query: query, block: block }
65
+ end
66
+
67
+ def self.fields_call(binder)
68
+ Multiple.bind(connector: connector, binder: binder, query: @fields[:query], block: @fields[:block]) if @fields
69
+ end
70
+ end
@@ -3,7 +3,9 @@
3
3
  #
4
4
  # Nokogiri wrapper
5
5
  #
6
- class Botz::ResourceAccessor::DirectHtml
6
+ class Botz::Connector::DirectHtml
7
+ include ::Botz::Connector::Html::Field
8
+
7
9
  def initialize(encoding: nil)
8
10
  @encoding = encoding
9
11
  end
@@ -3,7 +3,21 @@
3
3
  #
4
4
  # Mechanize wrapper
5
5
  #
6
- class Botz::ResourceAccessor::Html
6
+ class Botz::Connector::Html
7
+ #
8
+ # field macro
9
+ #
10
+ module Field
11
+ def field(object, query, &block)
12
+ node = object.resource.search(query)
13
+ fail "Could not be located #{query}" if node.nil?
14
+ return node.first.text if block.nil?
15
+
16
+ object.instance_exec(node, &block)
17
+ end
18
+ end
19
+ include Field
20
+
7
21
  USER_AGENT = [
8
22
  'Mozilla/5.0',
9
23
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -3,7 +3,7 @@
3
3
  #
4
4
  # This class is responsible for actually making a network connection and downloading hypertext
5
5
  #
6
- module Botz::ResourceAccessor
6
+ module Botz::Connector
7
7
  extend ActiveSupport::Autoload
8
8
  autoload :Html
9
9
  autoload :DirectHtml
@@ -7,7 +7,7 @@ class Botz::Definition
7
7
  class_attribute :domain
8
8
  class_attribute :spiders, default: {}
9
9
  class_attribute :scrapers, default: {}
10
- class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
10
+ class_attribute :output, default: ->(result) { STDOUT.puts(result.attributes.to_json) }
11
11
 
12
12
  def output(&block)
13
13
  self.output = block
@@ -15,35 +15,14 @@ class Botz::Definition
15
15
 
16
16
  # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
17
17
  class << self
18
- def scraper(name, encoding: nil, as: :html, &block)
19
- class_name = "#{name}_scraper".classify
20
- accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
21
- accessor = accessor_class.new(encoding: encoding)
22
- binder_base = Botz::Scraper.const_get(as.to_s.classify)
23
- binder = Class.new(binder_base, &block)
24
- binder.define_singleton_method(:name) { class_name }
25
- crawler_class = self
26
- scraper_class = Class.new do
27
- define_singleton_method(:crawler_class) { crawler_class }
28
- define_singleton_method(:bind) do |url|
29
- accessor.call(url) do |resource|
30
- binder.new(scraper_class, resource)
31
- end
32
- end
33
- define_singleton_method(:call) { |url, &output| bind(url).call(&output) }
34
- end
35
- const_set(class_name, scraper_class)
36
- scrapers[name] = scraper_class
37
- end
38
-
39
18
  def spider(name, start_url = nil, encoding: nil, as: :html, &block)
40
- accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
41
- accessor = accessor_class.new(start_url: start_url, encoding: encoding)
19
+ connector_class = Botz::Connector.const_get(as.to_s.classify)
20
+ connector = connector_class.new(start_url: start_url, encoding: encoding)
42
21
  spider = Botz::Spider.new(&block)
43
22
  spider_class = Class.new do
44
- define_singleton_method(:accessor) { accessor }
23
+ define_singleton_method(:connector) { connector }
45
24
  define_singleton_method(:call) do |url = start_url, &spider_block|
46
- accessor.call(url) do |resource|
25
+ connector.call(url) do |resource|
47
26
  spider.call(resource, &spider_block)
48
27
  end
49
28
  end
@@ -51,6 +30,74 @@ class Botz::Definition
51
30
  const_set("#{name}_spider".classify, spider_class)
52
31
  spiders[name] = spider_class
53
32
  end
33
+
34
+ def scraper(name, options, &block)
35
+ if options[:loop]
36
+ loop_scraper(name, options, &block)
37
+ else
38
+ normal_scraper(name, **options, &block)
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def loop_scraper(name, options, &block)
45
+ options = { as: :html, start_url: nil, encoding: nil, loop: nil }.merge(options)
46
+ result_class = Class.new(Botz::Result)
47
+
48
+ # connector
49
+ connector_class = Botz::Connector.const_get(options[:as].to_s.classify)
50
+ connector = connector_class.new(encoding: options[:encoding])
51
+
52
+ namespace = Class.new do
53
+ binder = Class.new(Botz::Binder) do
54
+ define_singleton_method(:connector) { connector }
55
+ define_singleton_method(:result_class) { result_class }
56
+ define_method(:connector) { connector }
57
+ instance_exec(&block)
58
+ end
59
+ define_singleton_method(:call) do |url = options[:start_url], &yielder|
60
+ connector.call(url) do |resource|
61
+ looper = Botz::Looper.new(resource, binder, options[:loop])
62
+ looper.call(&yielder)
63
+ end
64
+ end
65
+ end
66
+ const_set("#{name}_scraper".classify, namespace)
67
+ scrapers[name] = namespace
68
+ end
69
+
70
+ def normal_scraper(name, encoding: nil, as: :html, &block)
71
+ # result
72
+ result_class = Class.new(Botz::Result)
73
+
74
+ # connector
75
+ connector_class = Botz::Connector.const_get(as.to_s.classify)
76
+ connector = connector_class.new(encoding: encoding)
77
+
78
+ # namespace
79
+ namespace = Class.new do
80
+ binder = Class.new(Botz::Binder) do
81
+ define_singleton_method(:connector) { connector }
82
+ define_singleton_method(:result_class) { result_class }
83
+ define_method(:connector) { connector }
84
+ instance_exec(&block)
85
+ end
86
+ define_singleton_method(:bind) do |url|
87
+ connector.call(url) do |resource|
88
+ binder.new(resource)
89
+ end
90
+ end
91
+ define_singleton_method(:call) do |url, &output|
92
+ result = bind(url).result
93
+ fail "#{url}\n#{result.errors.full_messages}" if result.invalid?
94
+
95
+ output.call(result)
96
+ end
97
+ end
98
+ const_set("#{name}_scraper".classify, namespace)
99
+ scrapers[name] = namespace
100
+ end
54
101
  end
55
102
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
56
103
  end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # looper
5
+ #
6
+ class Botz::Looper
7
+ def initialize(resource, binder, loop_block)
8
+ @resource = resource
9
+ @binder = binder
10
+ @loop_block = loop_block
11
+ end
12
+
13
+ def call
14
+ yielder = lambda do |element|
15
+ result = @binder.new(element).result
16
+ fail "#{element}\n\n#{result.errors.full_messages}" if result.invalid?
17
+
18
+ yield result
19
+ end
20
+ @loop_block.call(@resource, yielder)
21
+ end
22
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Scrape results
5
+ #
6
+ class Botz::Result
7
+ include ActiveModel::Model
8
+ include ActiveModel::Attributes
9
+
10
+ def self.define(name)
11
+ case name
12
+ when /.*\?/
13
+ attribute name, :boolean
14
+ validates name, inclusion: { in: [true, false] }
15
+ else
16
+ attribute name
17
+ validates name, presence: true, allow_blank: true
18
+ end
19
+ end
20
+
21
+ attribute :fetched_at
22
+ attribute :fetched_on
23
+ end
data/lib/botz/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Botz
4
- VERSION = '0.4.0'
4
+ VERSION = '0.6.0'
5
5
  end
data/lib/botz.rb CHANGED
@@ -14,9 +14,11 @@ module Botz
14
14
  autoload :Console
15
15
  autoload :Definition
16
16
  autoload :DefinitionFile
17
+ autoload :Binder
17
18
  autoload :Spider
18
- autoload :Scraper
19
- autoload :ResourceAccessor
19
+ autoload :Looper
20
+ autoload :Connector
21
+ autoload :Result
20
22
 
21
23
  const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
22
24
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: botz
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-07-07 00:00:00.000000000 Z
11
+ date: 2019-07-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -148,15 +148,15 @@ files:
148
148
  - botz.gemspec
149
149
  - exe/botz
150
150
  - lib/botz.rb
151
+ - lib/botz/binder.rb
152
+ - lib/botz/connector.rb
153
+ - lib/botz/connector/direct_html.rb
154
+ - lib/botz/connector/html.rb
151
155
  - lib/botz/console.rb
152
156
  - lib/botz/definition.rb
153
157
  - lib/botz/definition_file.rb
154
- - lib/botz/resource_accessor.rb
155
- - lib/botz/resource_accessor/direct_html.rb
156
- - lib/botz/resource_accessor/html.rb
157
- - lib/botz/scraper.rb
158
- - lib/botz/scraper/direct_html.rb
159
- - lib/botz/scraper/html.rb
158
+ - lib/botz/looper.rb
159
+ - lib/botz/result.rb
160
160
  - lib/botz/shell.rb
161
161
  - lib/botz/spider.rb
162
162
  - lib/botz/version.rb
@@ -1,64 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # direct resource to html scraping
5
- #
6
- class Botz::Scraper::DirectHtml
7
- include ActiveModel::Model
8
- include ActiveModel::Attributes
9
-
10
- #
11
- # Scraper error class
12
- #
13
- class Error < StandardError
14
- def initialize(scraper_class, errors)
15
- super("#{scraper_class} # #{errors.full_messages}")
16
- end
17
- end
18
-
19
- attr_reader :scraper_class
20
- attr_reader :html
21
-
22
- def initialize(scraper_class, resource)
23
- @scraper_class = scraper_class
24
- @html = resource
25
- end
26
-
27
- class << self
28
- def field_names
29
- @field_names ||= []
30
- end
31
- end
32
-
33
- def to_h
34
- fetched_at = Time.current
35
- fetched_on = fetched_at.beginning_of_day
36
- timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
37
- self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
38
- end
39
-
40
- def call
41
- fail Error.new(scraper_class, errors) if invalid?
42
-
43
- yield(to_h)
44
- end
45
-
46
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
47
- def self.field(name, path = nil, persist: true, &block)
48
- if persist
49
- field_names << name
50
- case name
51
- when /.*\?/
52
- validates name, inclusion: { in: [true, false] }
53
- else
54
- validates name, presence: true, allow_blank: true
55
- end
56
- end
57
-
58
- return define_method(name) { instance_exec(html, &block) } if path.nil?
59
- return define_method(name) { html.search(path).text.strip } if block.nil?
60
-
61
- define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
62
- end
63
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
64
- end
@@ -1,70 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # DSL for parsing html into objects
5
- #
6
- class Botz::Scraper::Html
7
- include ActiveModel::Model
8
- include ActiveModel::Attributes
9
-
10
- #
11
- # Scraper error class
12
- #
13
- class Error < StandardError
14
- def initialize(scraper_class, url, errors)
15
- super("#{scraper_class}(#{url}) # #{errors.full_messages}")
16
- end
17
- end
18
-
19
- attr_reader :scraper_class
20
- attr_reader :url
21
- attr_reader :html
22
-
23
- def initialize(scraper_class, resource)
24
- @scraper_class = scraper_class
25
- @url = resource.uri
26
- @html = resource
27
- end
28
-
29
- class << self
30
- def field_names
31
- @field_names ||= []
32
- end
33
- end
34
-
35
- def primary_key
36
- url.to_s
37
- end
38
-
39
- def to_h
40
- fetched_at = Time.current
41
- fetched_on = fetched_at.beginning_of_day
42
- timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
43
- self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
44
- end
45
-
46
- def call
47
- fail Error.new(scraper_class, url, errors) if invalid?
48
-
49
- yield(to_h)
50
- end
51
-
52
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
53
- def self.field(name, path = nil, persist: true, &block)
54
- if persist
55
- field_names << name
56
- case name
57
- when /.*\?/
58
- validates name, inclusion: { in: [true, false] }
59
- else
60
- validates name, presence: true, allow_blank: true
61
- end
62
- end
63
-
64
- return define_method(name) { instance_exec(html, &block) } if path.nil?
65
- return define_method(name) { html.search(path).text.strip } if block.nil?
66
-
67
- define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
68
- end
69
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
70
- end
data/lib/botz/scraper.rb DELETED
@@ -1,10 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # scraper namespace
5
- #
6
- module Botz::Scraper
7
- extend ActiveSupport::Autoload
8
- autoload :Html
9
- autoload :DirectHtml
10
- end