spidy 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a6e8adabd5eef99735eb1e4ab44609e2f25a859b5604bb2512a8692b72dd0c9e
4
- data.tar.gz: ab748cec4971ae35044860ddc26465aaac67d56f47677919f47a3562bd29080a
3
+ metadata.gz: d88e7028d82156f7710e85af609aa8c703a92b5d48f92af5047b39f64e98b4ad
4
+ data.tar.gz: 984a44e45d06d72a0e8eaaa3c142f13fe81c5f77b0606b4dc883b695ae024f7a
5
5
  SHA512:
6
- metadata.gz: 747950afd87f68df3572aeb8859af14087a75bd1786c5183719ce94f7964c397e9b81a58c563f1a54883322620b7aa64d580144ec25e0fe4f85c20463ed21d68
7
- data.tar.gz: c2d972491eedbdb5308ceda060c71c95a409f9b4968a5897baf93a137d8f17411ad1afa9bfed7e93b0b5a177a706a358da56dc28a86e7dfe3f65152174b22ce8
6
+ metadata.gz: e2c6dafc543a85fc7c58ca9d7b3b9dc9fe3823fbe35aad3dfa60b7fece6434e3b0c4fbef6b507bd73ed7e827611a01d065b89df0b9d33e06b203707801eddaee
7
+ data.tar.gz: 82466dba270ffbec392383edd7afdad91d6c742d3580b733ef489e5199fd5c44956b70bd43bcb64d9e91b161e7aedf18fef96c4965ae655cece11b9ab585ebfc
data/.rubocop.yml CHANGED
@@ -25,3 +25,6 @@ Metrics/BlockLength:
25
25
 
26
26
  SignalException:
27
27
  EnforcedStyle: semantic
28
+
29
+ Layout/EmptyLineAfterGuardClause:
30
+ Enabled: false
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.0.7)
4
+ spidy (0.0.8)
5
5
  activemodel (~> 5.2)
6
6
  activesupport (~> 5.2)
7
7
  mechanize
data/README.md CHANGED
@@ -20,7 +20,44 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- TODO: Write usage instructions here
23
+ ### When used from the command line
24
+
25
+ website.rb
26
+ ```rb
27
+ Spidy.define do
28
+ spider(as: :html) do |yielder, connector, url|
29
+ connector.call(url) do |html|
30
+ # html as nokogiri object ( mechanize )
31
+ yielder.call(url)
32
+ end
33
+ end
34
+
35
+ define(as: :html) do
36
+ let(:object_name, 'nokogiri query')
37
+ end
38
+ end
39
+ ```
40
+ ```bash
41
+ echo 'http://example.com' | spidy each website.rb > urls
42
+ cat urls | spidy call website.rb > website.json
43
+ # shorthands
44
+ echo 'http://example.com' | spidy each website.rb | spidy call website.rb | jq .
45
+ ```
46
+
47
+ ### When used from Ruby code
48
+ ```rb
49
+ a = Spidy.define do
50
+ # Implementing spiders and scrapers
51
+ end
52
+
53
+ a.each(url) do |url|
54
+ # Loop for the number of retrieved URLs
55
+ end
56
+
57
+ a.call(url) do |object|
58
+ # The scrape result is passed as a defined object
59
+ end
60
+ ```
24
61
 
25
62
  ## Development
26
63
 
data/lib/spidy.rb CHANGED
@@ -29,6 +29,12 @@ module Spidy
29
29
  end
30
30
 
31
31
  def self.define(&block)
32
- Class.new(::Spidy::Definition, &block)
32
+ Module.new do
33
+ class_eval do
34
+ extend ::Spidy::Definition
35
+ module_eval(&block)
36
+ end
37
+ end
38
+ # Module.new(::Spidy::Definition, &block)
33
39
  end
34
40
  end
@@ -4,6 +4,18 @@
4
4
  # Mechanize wrapper
5
5
  #
6
6
  module Spidy::Connector::Html
7
+ #
8
+ # retry class
9
+ #
10
+ class Retry < StandardError
11
+ attr_reader :response_code
12
+ attr_reader :wait_time
13
+
14
+ def initialize(wait_time: 2, page: nil, error: nil)
15
+ @wait_time = wait_time
16
+ @response_code = error.try(:response_code) || page.try(:response_code)
17
+ end
18
+ end
7
19
  USER_AGENT = [
8
20
  'Mozilla/5.0',
9
21
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -13,33 +25,51 @@ module Spidy::Connector::Html
13
25
  'Safari/537.36'
14
26
  ].join(' ')
15
27
 
28
+ @logger = proc { |values| STDERR.puts(values.map { |k, v| "#{k}:#{v}" }.join("\t")) }
16
29
  @agent = Mechanize.new
17
30
  @agent.user_agent = USER_AGENT
18
31
 
19
32
  class << self
20
33
  attr_reader :agent
34
+ attr_accessor :logger
21
35
 
22
36
  def call(url, encoding: nil, retry_count: 3, &yielder)
23
37
  if encoding
24
- @agent.default_encoding = encoding
25
- @agent.force_default_encoding = true
38
+ agent.default_encoding = encoding
39
+ agent.force_default_encoding = true
26
40
  end
41
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
42
  get(url, retry_count, yielder)
28
43
  end
29
44
 
30
45
  private
31
46
 
47
+ # rubocop:disable Metrics/MethodLength
32
48
  def get(url, retry_count, yielder)
33
- @agent.get(url, &yielder)
49
+ agent.get(url) do |page|
50
+ fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
51
+
52
+ yielder.call(page)
53
+ end
34
54
  rescue Mechanize::ResponseCodeError => e
55
+ raise Retry, error: e if e.response_code == '429'
56
+ raise e
57
+ rescue Retry => e
58
+ logger.call('retry.accessed': Time.current,
59
+ 'retry.uri': url,
60
+ 'retry.response_code': e.response_code,
61
+ 'retry.rest_count': retry_count)
62
+
63
+ @agent = Mechanize.new
64
+ @agent.user_agent = USER_AGENT
65
+
35
66
  retry_count -= 1
36
- case e.response_code
37
- when '429'
38
- sleep 2
39
- retry if retry_count
40
- else
41
- raise e
67
+ if retry_count.positive?
68
+ sleep e.wait_time
69
+ retry
42
70
  end
71
+ raise e
43
72
  end
73
+ # rubocop:enable Metrics/MethodLength
44
74
  end
45
75
  end
data/lib/spidy/console.rb CHANGED
@@ -5,7 +5,7 @@
5
5
  #
6
6
  class Spidy::Console
7
7
  attr_reader :definition_file
8
- delegate :namespace, :spiders, to: :definition_file
8
+ delegate :spidy, to: :definition_file
9
9
 
10
10
  def initialize(definition_file = nil)
11
11
  @definition_file = definition_file
@@ -20,10 +20,10 @@ class Spidy::Console
20
20
  end
21
21
 
22
22
  def call(name = :default, url = nil, &block)
23
- namespace[name].call(url, &block)
23
+ spidy.call(name, url: url, &block)
24
24
  end
25
25
 
26
26
  def each(name = :default, url = nil, &block)
27
- spiders[name].call(url, &block)
27
+ spidy.each(name, url: url, &block)
28
28
  end
29
29
  end
@@ -3,35 +3,60 @@
3
3
  #
4
4
  # Class representing a website defined by DSL
5
5
  #
6
- class Spidy::Definition
7
- class_attribute :namespace, default: {}
8
- class_attribute :spiders, default: {}
9
-
10
- class << self
11
- def spider(name = :default, connector: nil, as: nil)
12
- connector = Spidy::Connector.get(as || connector) || connector
13
- spiders[name] = proc do |url, &yielder|
14
- yield(yielder, connector, url)
15
- end
16
- end
6
+ module Spidy::Definition
7
+ def call(name = :default, url: nil, stream: nil, err: nil, &output)
8
+ name = name.presence || :default
9
+ spidy = @namespace["#{name}_scraper"]
10
+ fail "undefined spidy [#{name}]" if spidy.nil?
11
+
12
+ exec(spidy, url: url, stream: stream, err: err, &output)
13
+ end
17
14
 
18
- def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
19
- connector = Spidy::Connector.get(as || connector) || connector
20
- binder = Spidy::Binder.get(as || binder) || binder
21
- namespace[name] = define_proc(connector, binder, define_block)
15
+ def each(name = :default, url: nil, stream: nil, err: nil, &output)
16
+ name = name.presence || :default
17
+ spidy = @namespace["#{name}_spider"]
18
+ fail "undefined spidy [#{name}]" if spidy.nil?
19
+
20
+ exec(spidy, url: url, stream: stream, err: err, &output)
21
+ end
22
+
23
+ def spider(name = :default, connector: nil, as: nil)
24
+ @namespace ||= {}
25
+ connector = Spidy::Connector.get(as || connector) || connector
26
+ @namespace["#{name}_spider"] = proc do |url, &yielder|
27
+ yield(yielder, connector, url)
22
28
  end
29
+ end
23
30
 
24
- private
31
+ def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
32
+ @namespace ||= {}
33
+ connector = Spidy::Connector.get(as || connector) || connector
34
+ binder = Spidy::Binder.get(as || binder) || binder
35
+ @namespace["#{name}_scraper"] = define_proc(connector, binder, define_block)
36
+ end
37
+
38
+ private
39
+
40
+ def exec(spidy, url: nil, stream: nil, err: nil, &output)
41
+ return spidy.call(url, &output) if stream.nil?
42
+
43
+ stream.each do |value|
44
+ spidy.call(value.strip, &output)
45
+ rescue StandardError => e
46
+ raise e if err.nil?
47
+
48
+ err.call(e, value.strip)
49
+ end
50
+ end
25
51
 
26
- def define_proc(connector, binder, define_block)
27
- proc do |url, &yielder|
28
- fail 'invalid argument [Required url / block]' if url.blank? && yielder.nil?
52
+ def define_proc(connector, binder, define_block)
53
+ proc do |url, &yielder|
54
+ fail 'invalid argument [Required url / block]' if url.blank? && yielder.nil?
29
55
 
30
- connection_yielder = lambda do |resource|
31
- binder.call(resource, define_block) { |object| yielder.call(object) }
32
- end
33
- connector.call(url, &connection_yielder)
56
+ connection_yielder = lambda do |resource|
57
+ binder.call(resource, define_block) { |object| yielder.call(object) }
34
58
  end
59
+ connector.call(url, &connection_yielder)
35
60
  end
36
61
  end
37
62
  end
@@ -5,14 +5,7 @@
5
5
  #
6
6
  class Spidy::DefinitionFile
7
7
  attr_reader :path
8
- attr_reader :definition
9
- delegate :namespace, :spiders, to: :definition
10
-
11
- CSV = lambda do |result|
12
- ::CSV.generate do |csv|
13
- csv << result.definition.attributes_to_array
14
- end
15
- end
8
+ attr_reader :spidy
16
9
 
17
10
  def self.open(filepath)
18
11
  object = new(filepath)
@@ -22,7 +15,7 @@ class Spidy::DefinitionFile
22
15
 
23
16
  # rubocop:disable Security/Eval
24
17
  def eval_definition
25
- @definition = eval(File.open(path).read)
18
+ @spidy = eval(File.open(path).read)
26
19
  end
27
20
  # rubocop:enable Security/Eval
28
21
 
data/lib/spidy/shell.rb CHANGED
@@ -6,20 +6,25 @@ require 'pry'
6
6
  # spidy shell interface
7
7
  #
8
8
  class Spidy::Shell
9
- attr_reader :definition_file
10
- class_attribute :output_yielder, default: (proc { |result| STDOUT.puts(result.to_s) })
11
- delegate :namespace, :spiders, to: :definition_file
9
+ attr_reader :definition_file, :spidy
10
+ class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
11
+ class_attribute :error_handler, default: (proc { |e, url| STDERR.puts("#{url}\n #{e.message}") })
12
+ delegate :spidy, to: :definition_file
12
13
 
13
14
  def initialize(definition_file)
14
15
  @definition_file = definition_file
15
16
  end
16
17
 
17
18
  def call(name)
18
- exec(namespace[name&.to_sym] || namespace.values.first)
19
+ return spidy.call(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
20
+
21
+ spidy.call(name, err: error_handler, &output)
19
22
  end
20
23
 
21
24
  def each(name)
22
- exec(spiders[name&.to_sym] || spiders.values.first)
25
+ return spidy.each(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
26
+
27
+ spidy.each(name, err: error_handler, &output)
23
28
  end
24
29
 
25
30
  def function
@@ -66,17 +71,4 @@ class Spidy::Shell
66
71
  RUBY
67
72
  end
68
73
  end
69
-
70
- private
71
-
72
- def exec(command)
73
- fail "undefined commmand[#{name}]" if command.nil?
74
- return command.call(&output_yielder) unless FileTest.pipe?(STDIN)
75
-
76
- STDIN.each do |line|
77
- command.call(line.strip, &output_yielder)
78
- rescue StandardError => e
79
- STDERR.puts("#{line.strip}\n #{e.message}")
80
- end
81
- end
82
74
  end
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.0.8'
4
+ VERSION = '0.0.9'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-20 00:00:00.000000000 Z
11
+ date: 2019-10-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler