botz 0.4.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fe53c15b160a37521cde364f0a8dbcaf0daba0a7ef472dc8bbb560fc4666bef1
4
- data.tar.gz: 51ca8fd5e7272be7bc89b86d510f7b3486c86c3dd7fd42d3c04287d960702010
3
+ metadata.gz: 8dc85ac7df0f64087255c2aa722ba3e6151baca615ce00d9ce3c7cebd022f231
4
+ data.tar.gz: 7f77369c8e744b050b2b08273b35f1e95e299d8f8adad3e7760e41f920a10af9
5
5
  SHA512:
6
- metadata.gz: 10166b292f8c67d624a9c65c39471e02155235977c095b3423bbb7e4a83e45bfd345c8fad2161313678c7208b07c1e99d5ae76444178c4f55189aa5696c6d752
7
- data.tar.gz: 23bd88361bf928ac2c52fec73094e07d4f3e535f195fd8acc29d94bd8c49f3e428c6a52f47e61d6f558d48ebc9be941755311a47c4e03c9686e1497f0a78a095
6
+ metadata.gz: 4d7f5be597513d01464b23d6053d037b014008735745cb267d9299581efd8f11b1c5109929490b80cfbfee850c88202a3f0d298341a192cfe1f96212c72d8eee
7
+ data.tar.gz: 2f885e634ed4fe24ebd642bd65cf7c7c059eb1ec31d7a4e842cec0663527ba1da8366f719ded80a9e43346de8aa98e6f23564cde7f8e01265684bd44fbf5d5b5
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- botz (0.4.0)
4
+ botz (0.5.0)
5
5
  activemodel (~> 5.2)
6
6
  activesupport (~> 5.2)
7
7
  mechanize
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Bind resource received from the connection to the result object
5
+ #
6
+ class Botz::Binder
7
+ class_attribute :field_names, default: []
8
+ attr_reader :resource
9
+
10
+ #
11
+ # binding multiple
12
+ #
13
+ class Multiple
14
+ def self.bind(connector:, binder:, query:, block:)
15
+ multiple_binding_class = self
16
+ connector.field(binder, query) do |elements|
17
+ multiple_binding_class.new(binder.class).instance_exec(elements, &block)
18
+ end
19
+ end
20
+
21
+ def initialize(binder)
22
+ @binder = binder
23
+ end
24
+
25
+ def field(name)
26
+ @binder.field_names << name
27
+ @binder.field_names.uniq!
28
+ @binder.result_class.define(name)
29
+ result = yield
30
+ @binder.define_method(name) { result }
31
+ end
32
+ end
33
+
34
+ def initialize(resource)
35
+ @resource = resource
36
+ self.class.fields_call(self)
37
+ end
38
+
39
+ def result
40
+ new_result(field_names.map { |field_name| [field_name, send(field_name)] }.to_h)
41
+ end
42
+
43
+ def new_result(values)
44
+ fetched_at = Time.current
45
+ self.class.result_class.new(fetched_at: fetched_at, fetched_on: fetched_at.beginning_of_day, **values)
46
+ end
47
+
48
+ def self.query(name, query = nil, &block)
49
+ define_method(name) do
50
+ connector.field(self, query, &block)
51
+ end
52
+ end
53
+
54
+ def self.field(name, query = nil, &block)
55
+ field_names << name
56
+ field_names.uniq!
57
+ result_class.define(name)
58
+ define_method(name) do
59
+ connector.field(self, query, &block)
60
+ end
61
+ end
62
+
63
+ def self.fields(query, &block)
64
+ @fields = { query: query, block: block }
65
+ end
66
+
67
+ def self.fields_call(binder)
68
+ Multiple.bind(connector: connector, binder: binder, query: @fields[:query], block: @fields[:block]) if @fields
69
+ end
70
+ end
@@ -3,7 +3,9 @@
3
3
  #
4
4
  # Nokogiri wrapper
5
5
  #
6
- class Botz::ResourceAccessor::DirectHtml
6
+ class Botz::Connector::DirectHtml
7
+ include ::Botz::Connector::Html::Field
8
+
7
9
  def initialize(encoding: nil)
8
10
  @encoding = encoding
9
11
  end
@@ -3,7 +3,21 @@
3
3
  #
4
4
  # Mechanize wrapper
5
5
  #
6
- class Botz::ResourceAccessor::Html
6
+ class Botz::Connector::Html
7
+ #
8
+ # field macro
9
+ #
10
+ module Field
11
+ def field(object, query, &block)
12
+ node = object.resource.search(query)
13
+ fail "Could not be located #{query}" if node.nil?
14
+ return node.first.text if block.nil?
15
+
16
+ object.instance_exec(node, &block)
17
+ end
18
+ end
19
+ include Field
20
+
7
21
  USER_AGENT = [
8
22
  'Mozilla/5.0',
9
23
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -3,7 +3,7 @@
3
3
  #
4
4
  # This class is responsible for actually making a network connection and downloading hypertext
5
5
  #
6
- module Botz::ResourceAccessor
6
+ module Botz::Connector
7
7
  extend ActiveSupport::Autoload
8
8
  autoload :Html
9
9
  autoload :DirectHtml
@@ -7,7 +7,7 @@ class Botz::Definition
7
7
  class_attribute :domain
8
8
  class_attribute :spiders, default: {}
9
9
  class_attribute :scrapers, default: {}
10
- class_attribute :output, default: ->(result) { STDOUT.puts(result.to_json) }
10
+ class_attribute :output, default: ->(result) { STDOUT.puts(result.attributes.to_json) }
11
11
 
12
12
  def output(&block)
13
13
  self.output = block
@@ -15,35 +15,14 @@ class Botz::Definition
15
15
 
16
16
  # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
17
17
  class << self
18
- def scraper(name, encoding: nil, as: :html, &block)
19
- class_name = "#{name}_scraper".classify
20
- accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
21
- accessor = accessor_class.new(encoding: encoding)
22
- binder_base = Botz::Scraper.const_get(as.to_s.classify)
23
- binder = Class.new(binder_base, &block)
24
- binder.define_singleton_method(:name) { class_name }
25
- crawler_class = self
26
- scraper_class = Class.new do
27
- define_singleton_method(:crawler_class) { crawler_class }
28
- define_singleton_method(:bind) do |url|
29
- accessor.call(url) do |resource|
30
- binder.new(scraper_class, resource)
31
- end
32
- end
33
- define_singleton_method(:call) { |url, &output| bind(url).call(&output) }
34
- end
35
- const_set(class_name, scraper_class)
36
- scrapers[name] = scraper_class
37
- end
38
-
39
18
  def spider(name, start_url = nil, encoding: nil, as: :html, &block)
40
- accessor_class = Botz::ResourceAccessor.const_get(as.to_s.classify)
41
- accessor = accessor_class.new(start_url: start_url, encoding: encoding)
19
+ connector_class = Botz::Connector.const_get(as.to_s.classify)
20
+ connector = connector_class.new(start_url: start_url, encoding: encoding)
42
21
  spider = Botz::Spider.new(&block)
43
22
  spider_class = Class.new do
44
- define_singleton_method(:accessor) { accessor }
23
+ define_singleton_method(:connector) { connector }
45
24
  define_singleton_method(:call) do |url = start_url, &spider_block|
46
- accessor.call(url) do |resource|
25
+ connector.call(url) do |resource|
47
26
  spider.call(resource, &spider_block)
48
27
  end
49
28
  end
@@ -51,6 +30,74 @@ class Botz::Definition
51
30
  const_set("#{name}_spider".classify, spider_class)
52
31
  spiders[name] = spider_class
53
32
  end
33
+
34
+ def scraper(name, options, &block)
35
+ if options[:loop]
36
+ loop_scraper(name, options, &block)
37
+ else
38
+ normal_scraper(name, **options, &block)
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def loop_scraper(name, options, &block)
45
+ options = { as: :html, start_url: nil, encoding: nil, loop: nil }.merge(options)
46
+ result_class = Class.new(Botz::Result)
47
+
48
+ # connector
49
+ connector_class = Botz::Connector.const_get(options[:as].to_s.classify)
50
+ connector = connector_class.new(encoding: options[:encoding])
51
+
52
+ namespace = Class.new do
53
+ binder = Class.new(Botz::Binder) do
54
+ define_singleton_method(:connector) { connector }
55
+ define_singleton_method(:result_class) { result_class }
56
+ define_method(:connector) { connector }
57
+ instance_exec(&block)
58
+ end
59
+ define_singleton_method(:call) do |url = options[:start_url], &yielder|
60
+ connector.call(url) do |resource|
61
+ looper = Botz::Looper.new(resource, binder, options[:loop])
62
+ looper.call(&yielder)
63
+ end
64
+ end
65
+ end
66
+ const_set("#{name}_scraper".classify, namespace)
67
+ scrapers[name] = namespace
68
+ end
69
+
70
+ def normal_scraper(name, encoding: nil, as: :html, &block)
71
+ # result
72
+ result_class = Class.new(Botz::Result)
73
+
74
+ # connector
75
+ connector_class = Botz::Connector.const_get(as.to_s.classify)
76
+ connector = connector_class.new(encoding: encoding)
77
+
78
+ # namespace
79
+ namespace = Class.new do
80
+ binder = Class.new(Botz::Binder) do
81
+ define_singleton_method(:connector) { connector }
82
+ define_singleton_method(:result_class) { result_class }
83
+ define_method(:connector) { connector }
84
+ instance_exec(&block)
85
+ end
86
+ define_singleton_method(:bind) do |url|
87
+ connector.call(url) do |resource|
88
+ binder.new(resource)
89
+ end
90
+ end
91
+ define_singleton_method(:call) do |url, &output|
92
+ result = bind(url).result
93
+ fail "#{url}\n#{result.errors.full_messages}" if result.invalid?
94
+
95
+ output.call(result)
96
+ end
97
+ end
98
+ const_set("#{name}_scraper".classify, namespace)
99
+ scrapers[name] = namespace
100
+ end
54
101
  end
55
102
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
56
103
  end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # looper
5
+ #
6
+ class Botz::Looper
7
+ def initialize(resource, binder, loop_block)
8
+ @resource = resource
9
+ @binder = binder
10
+ @loop_block = loop_block
11
+ end
12
+
13
+ def call
14
+ yielder = lambda do |element|
15
+ result = @binder.new(element).result
16
+ fail "#{element}\n\n#{result.errors.full_messages}" if result.invalid?
17
+
18
+ yield result
19
+ end
20
+ @loop_block.call(@resource, yielder)
21
+ end
22
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Scrape results
5
+ #
6
+ class Botz::Result
7
+ include ActiveModel::Model
8
+ include ActiveModel::Attributes
9
+
10
+ def self.define(name)
11
+ case name
12
+ when /.*\?/
13
+ attribute name, :boolean
14
+ validates name, inclusion: { in: [true, false] }
15
+ else
16
+ attribute name
17
+ validates name, presence: true, allow_blank: true
18
+ end
19
+ end
20
+
21
+ attribute :fetched_at
22
+ attribute :fetched_on
23
+ end
data/lib/botz/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Botz
4
- VERSION = '0.4.0'
4
+ VERSION = '0.6.0'
5
5
  end
data/lib/botz.rb CHANGED
@@ -14,9 +14,11 @@ module Botz
14
14
  autoload :Console
15
15
  autoload :Definition
16
16
  autoload :DefinitionFile
17
+ autoload :Binder
17
18
  autoload :Spider
18
- autoload :Scraper
19
- autoload :ResourceAccessor
19
+ autoload :Looper
20
+ autoload :Connector
21
+ autoload :Result
20
22
 
21
23
  const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
22
24
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: botz
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-07-07 00:00:00.000000000 Z
11
+ date: 2019-07-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -148,15 +148,15 @@ files:
148
148
  - botz.gemspec
149
149
  - exe/botz
150
150
  - lib/botz.rb
151
+ - lib/botz/binder.rb
152
+ - lib/botz/connector.rb
153
+ - lib/botz/connector/direct_html.rb
154
+ - lib/botz/connector/html.rb
151
155
  - lib/botz/console.rb
152
156
  - lib/botz/definition.rb
153
157
  - lib/botz/definition_file.rb
154
- - lib/botz/resource_accessor.rb
155
- - lib/botz/resource_accessor/direct_html.rb
156
- - lib/botz/resource_accessor/html.rb
157
- - lib/botz/scraper.rb
158
- - lib/botz/scraper/direct_html.rb
159
- - lib/botz/scraper/html.rb
158
+ - lib/botz/looper.rb
159
+ - lib/botz/result.rb
160
160
  - lib/botz/shell.rb
161
161
  - lib/botz/spider.rb
162
162
  - lib/botz/version.rb
@@ -1,64 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # direct resource to html scraping
5
- #
6
- class Botz::Scraper::DirectHtml
7
- include ActiveModel::Model
8
- include ActiveModel::Attributes
9
-
10
- #
11
- # Scraper error class
12
- #
13
- class Error < StandardError
14
- def initialize(scraper_class, errors)
15
- super("#{scraper_class} # #{errors.full_messages}")
16
- end
17
- end
18
-
19
- attr_reader :scraper_class
20
- attr_reader :html
21
-
22
- def initialize(scraper_class, resource)
23
- @scraper_class = scraper_class
24
- @html = resource
25
- end
26
-
27
- class << self
28
- def field_names
29
- @field_names ||= []
30
- end
31
- end
32
-
33
- def to_h
34
- fetched_at = Time.current
35
- fetched_on = fetched_at.beginning_of_day
36
- timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
37
- self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
38
- end
39
-
40
- def call
41
- fail Error.new(scraper_class, errors) if invalid?
42
-
43
- yield(to_h)
44
- end
45
-
46
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
47
- def self.field(name, path = nil, persist: true, &block)
48
- if persist
49
- field_names << name
50
- case name
51
- when /.*\?/
52
- validates name, inclusion: { in: [true, false] }
53
- else
54
- validates name, presence: true, allow_blank: true
55
- end
56
- end
57
-
58
- return define_method(name) { instance_exec(html, &block) } if path.nil?
59
- return define_method(name) { html.search(path).text.strip } if block.nil?
60
-
61
- define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
62
- end
63
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
64
- end
@@ -1,70 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # DSL for parsing html into objects
5
- #
6
- class Botz::Scraper::Html
7
- include ActiveModel::Model
8
- include ActiveModel::Attributes
9
-
10
- #
11
- # Scraper error class
12
- #
13
- class Error < StandardError
14
- def initialize(scraper_class, url, errors)
15
- super("#{scraper_class}(#{url}) # #{errors.full_messages}")
16
- end
17
- end
18
-
19
- attr_reader :scraper_class
20
- attr_reader :url
21
- attr_reader :html
22
-
23
- def initialize(scraper_class, resource)
24
- @scraper_class = scraper_class
25
- @url = resource.uri
26
- @html = resource
27
- end
28
-
29
- class << self
30
- def field_names
31
- @field_names ||= []
32
- end
33
- end
34
-
35
- def primary_key
36
- url.to_s
37
- end
38
-
39
- def to_h
40
- fetched_at = Time.current
41
- fetched_on = fetched_at.beginning_of_day
42
- timestamps = { fetched_on: fetched_on, fetched_at: fetched_at }
43
- self.class.field_names.map { |field| [field, send(field)] }.to_h.merge(timestamps)
44
- end
45
-
46
- def call
47
- fail Error.new(scraper_class, url, errors) if invalid?
48
-
49
- yield(to_h)
50
- end
51
-
52
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
53
- def self.field(name, path = nil, persist: true, &block)
54
- if persist
55
- field_names << name
56
- case name
57
- when /.*\?/
58
- validates name, inclusion: { in: [true, false] }
59
- else
60
- validates name, presence: true, allow_blank: true
61
- end
62
- end
63
-
64
- return define_method(name) { instance_exec(html, &block) } if path.nil?
65
- return define_method(name) { html.search(path).text.strip } if block.nil?
66
-
67
- define_method(name) { html.search(path).first.try { |e| instance_exec(e, &block) } }
68
- end
69
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
70
- end
data/lib/botz/scraper.rb DELETED
@@ -1,10 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # scraper namespace
5
- #
6
- module Botz::Scraper
7
- extend ActiveSupport::Autoload
8
- autoload :Html
9
- autoload :DirectHtml
10
- end