spidy 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a6e8adabd5eef99735eb1e4ab44609e2f25a859b5604bb2512a8692b72dd0c9e
4
- data.tar.gz: ab748cec4971ae35044860ddc26465aaac67d56f47677919f47a3562bd29080a
3
+ metadata.gz: d88e7028d82156f7710e85af609aa8c703a92b5d48f92af5047b39f64e98b4ad
4
+ data.tar.gz: 984a44e45d06d72a0e8eaaa3c142f13fe81c5f77b0606b4dc883b695ae024f7a
5
5
  SHA512:
6
- metadata.gz: 747950afd87f68df3572aeb8859af14087a75bd1786c5183719ce94f7964c397e9b81a58c563f1a54883322620b7aa64d580144ec25e0fe4f85c20463ed21d68
7
- data.tar.gz: c2d972491eedbdb5308ceda060c71c95a409f9b4968a5897baf93a137d8f17411ad1afa9bfed7e93b0b5a177a706a358da56dc28a86e7dfe3f65152174b22ce8
6
+ metadata.gz: e2c6dafc543a85fc7c58ca9d7b3b9dc9fe3823fbe35aad3dfa60b7fece6434e3b0c4fbef6b507bd73ed7e827611a01d065b89df0b9d33e06b203707801eddaee
7
+ data.tar.gz: 82466dba270ffbec392383edd7afdad91d6c742d3580b733ef489e5199fd5c44956b70bd43bcb64d9e91b161e7aedf18fef96c4965ae655cece11b9ab585ebfc
data/.rubocop.yml CHANGED
@@ -25,3 +25,6 @@ Metrics/BlockLength:
25
25
 
26
26
  SignalException:
27
27
  EnforcedStyle: semantic
28
+
29
+ Layout/EmptyLineAfterGuardClause:
30
+ Enabled: false
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.0.7)
4
+ spidy (0.0.8)
5
5
  activemodel (~> 5.2)
6
6
  activesupport (~> 5.2)
7
7
  mechanize
data/README.md CHANGED
@@ -20,7 +20,44 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- TODO: Write usage instructions here
23
+ ### When used from the command line
24
+
25
+ website.rb
26
+ ```rb
27
+ Spidy.define do
28
+ spider(as: :html) do |yielder, connector, url|
29
+ connector.call(url) do |html|
30
+ # html as nokogiri object ( mechanize )
31
+ yielder.call(url)
32
+ end
33
+ end
34
+
35
+ define(as: :html) do
36
+ let(:object_name, 'nokogiri query')
37
+ end
38
+ end
39
+ ```
40
+ ```bash
41
+ echo 'http://example.com' | spidy each website.rb > urls
42
+ cat urls | spidy call website.rb > website.json
43
+ # shorthands
44
+ echo 'http://example.com' | spidy each website.rb | spidy call website.rb | jq .
45
+ ```
46
+
47
+ ### When used from Ruby code
48
+ ```rb
49
+ a = Spidy.define do
50
+ # Implementing spiders and scrapers
51
+ end
52
+
53
+ a.each(url) do |url|
54
+ # Loop for the number of retrieved URLs
55
+ end
56
+
57
+ a.call(url) do |object|
58
+ # The scrape result is passed as a defined object
59
+ end
60
+ ```
24
61
 
25
62
  ## Development
26
63
 
data/lib/spidy.rb CHANGED
@@ -29,6 +29,12 @@ module Spidy
29
29
  end
30
30
 
31
31
  def self.define(&block)
32
- Class.new(::Spidy::Definition, &block)
32
+ Module.new do
33
+ class_eval do
34
+ extend ::Spidy::Definition
35
+ module_eval(&block)
36
+ end
37
+ end
38
+ # Module.new(::Spidy::Definition, &block)
33
39
  end
34
40
  end
@@ -4,6 +4,18 @@
4
4
  # Mechanize wrapper
5
5
  #
6
6
  module Spidy::Connector::Html
7
+ #
8
+ # retry class
9
+ #
10
+ class Retry < StandardError
11
+ attr_reader :response_code
12
+ attr_reader :wait_time
13
+
14
+ def initialize(wait_time: 2, page: nil, error: nil)
15
+ @wait_time = wait_time
16
+ @response_code = error.try(:response_code) || page.try(:response_code)
17
+ end
18
+ end
7
19
  USER_AGENT = [
8
20
  'Mozilla/5.0',
9
21
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -13,33 +25,51 @@ module Spidy::Connector::Html
13
25
  'Safari/537.36'
14
26
  ].join(' ')
15
27
 
28
+ @logger = proc { |values| STDERR.puts(values.map { |k, v| "#{k}:#{v}" }.join("\t")) }
16
29
  @agent = Mechanize.new
17
30
  @agent.user_agent = USER_AGENT
18
31
 
19
32
  class << self
20
33
  attr_reader :agent
34
+ attr_accessor :logger
21
35
 
22
36
  def call(url, encoding: nil, retry_count: 3, &yielder)
23
37
  if encoding
24
- @agent.default_encoding = encoding
25
- @agent.force_default_encoding = true
38
+ agent.default_encoding = encoding
39
+ agent.force_default_encoding = true
26
40
  end
41
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
42
  get(url, retry_count, yielder)
28
43
  end
29
44
 
30
45
  private
31
46
 
47
+ # rubocop:disable Metrics/MethodLength
32
48
  def get(url, retry_count, yielder)
33
- @agent.get(url, &yielder)
49
+ agent.get(url) do |page|
50
+ fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
51
+
52
+ yielder.call(page)
53
+ end
34
54
  rescue Mechanize::ResponseCodeError => e
55
+ raise Retry, error: e if e.response_code == '429'
56
+ raise e
57
+ rescue Retry => e
58
+ logger.call('retry.accessed': Time.current,
59
+ 'retry.uri': url,
60
+ 'retry.response_code': e.response_code,
61
+ 'retry.rest_count': retry_count)
62
+
63
+ @agent = Mechanize.new
64
+ @agent.user_agent = USER_AGENT
65
+
35
66
  retry_count -= 1
36
- case e.response_code
37
- when '429'
38
- sleep 2
39
- retry if retry_count
40
- else
41
- raise e
67
+ if retry_count.positive?
68
+ sleep e.wait_time
69
+ retry
42
70
  end
71
+ raise e
43
72
  end
73
+ # rubocop:enable Metrics/MethodLength
44
74
  end
45
75
  end
data/lib/spidy/console.rb CHANGED
@@ -5,7 +5,7 @@
5
5
  #
6
6
  class Spidy::Console
7
7
  attr_reader :definition_file
8
- delegate :namespace, :spiders, to: :definition_file
8
+ delegate :spidy, to: :definition_file
9
9
 
10
10
  def initialize(definition_file = nil)
11
11
  @definition_file = definition_file
@@ -20,10 +20,10 @@ class Spidy::Console
20
20
  end
21
21
 
22
22
  def call(name = :default, url = nil, &block)
23
- namespace[name].call(url, &block)
23
+ spidy.call(name, url: url, &block)
24
24
  end
25
25
 
26
26
  def each(name = :default, url = nil, &block)
27
- spiders[name].call(url, &block)
27
+ spidy.each(name, url: url, &block)
28
28
  end
29
29
  end
@@ -3,35 +3,60 @@
3
3
  #
4
4
  # Class representing a website defined by DSL
5
5
  #
6
- class Spidy::Definition
7
- class_attribute :namespace, default: {}
8
- class_attribute :spiders, default: {}
9
-
10
- class << self
11
- def spider(name = :default, connector: nil, as: nil)
12
- connector = Spidy::Connector.get(as || connector) || connector
13
- spiders[name] = proc do |url, &yielder|
14
- yield(yielder, connector, url)
15
- end
16
- end
6
+ module Spidy::Definition
7
+ def call(name = :default, url: nil, stream: nil, err: nil, &output)
8
+ name = name.presence || :default
9
+ spidy = @namespace["#{name}_scraper"]
10
+ fail "undefined spidy [#{name}]" if spidy.nil?
11
+
12
+ exec(spidy, url: url, stream: stream, err: err, &output)
13
+ end
17
14
 
18
- def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
19
- connector = Spidy::Connector.get(as || connector) || connector
20
- binder = Spidy::Binder.get(as || binder) || binder
21
- namespace[name] = define_proc(connector, binder, define_block)
15
+ def each(name = :default, url: nil, stream: nil, err: nil, &output)
16
+ name = name.presence || :default
17
+ spidy = @namespace["#{name}_spider"]
18
+ fail "undefined spidy [#{name}]" if spidy.nil?
19
+
20
+ exec(spidy, url: url, stream: stream, err: err, &output)
21
+ end
22
+
23
+ def spider(name = :default, connector: nil, as: nil)
24
+ @namespace ||= {}
25
+ connector = Spidy::Connector.get(as || connector) || connector
26
+ @namespace["#{name}_spider"] = proc do |url, &yielder|
27
+ yield(yielder, connector, url)
22
28
  end
29
+ end
23
30
 
24
- private
31
+ def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
32
+ @namespace ||= {}
33
+ connector = Spidy::Connector.get(as || connector) || connector
34
+ binder = Spidy::Binder.get(as || binder) || binder
35
+ @namespace["#{name}_scraper"] = define_proc(connector, binder, define_block)
36
+ end
37
+
38
+ private
39
+
40
+ def exec(spidy, url: nil, stream: nil, err: nil, &output)
41
+ return spidy.call(url, &output) if stream.nil?
42
+
43
+ stream.each do |value|
44
+ spidy.call(value.strip, &output)
45
+ rescue StandardError => e
46
+ raise e if err.nil?
47
+
48
+ err.call(e, value.strip)
49
+ end
50
+ end
25
51
 
26
- def define_proc(connector, binder, define_block)
27
- proc do |url, &yielder|
28
- fail 'invalid argument [Required url / block]' if url.blank? && yielder.nil?
52
+ def define_proc(connector, binder, define_block)
53
+ proc do |url, &yielder|
54
+ fail 'invalid argument [Required url / block]' if url.blank? && yielder.nil?
29
55
 
30
- connection_yielder = lambda do |resource|
31
- binder.call(resource, define_block) { |object| yielder.call(object) }
32
- end
33
- connector.call(url, &connection_yielder)
56
+ connection_yielder = lambda do |resource|
57
+ binder.call(resource, define_block) { |object| yielder.call(object) }
34
58
  end
59
+ connector.call(url, &connection_yielder)
35
60
  end
36
61
  end
37
62
  end
@@ -5,14 +5,7 @@
5
5
  #
6
6
  class Spidy::DefinitionFile
7
7
  attr_reader :path
8
- attr_reader :definition
9
- delegate :namespace, :spiders, to: :definition
10
-
11
- CSV = lambda do |result|
12
- ::CSV.generate do |csv|
13
- csv << result.definition.attributes_to_array
14
- end
15
- end
8
+ attr_reader :spidy
16
9
 
17
10
  def self.open(filepath)
18
11
  object = new(filepath)
@@ -22,7 +15,7 @@ class Spidy::DefinitionFile
22
15
 
23
16
  # rubocop:disable Security/Eval
24
17
  def eval_definition
25
- @definition = eval(File.open(path).read)
18
+ @spidy = eval(File.open(path).read)
26
19
  end
27
20
  # rubocop:enable Security/Eval
28
21
 
data/lib/spidy/shell.rb CHANGED
@@ -6,20 +6,25 @@ require 'pry'
6
6
  # spidy shell interface
7
7
  #
8
8
  class Spidy::Shell
9
- attr_reader :definition_file
10
- class_attribute :output_yielder, default: (proc { |result| STDOUT.puts(result.to_s) })
11
- delegate :namespace, :spiders, to: :definition_file
9
+ attr_reader :definition_file, :spidy
10
+ class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
11
+ class_attribute :error_handler, default: (proc { |e, url| STDERR.puts("#{url}\n #{e.message}") })
12
+ delegate :spidy, to: :definition_file
12
13
 
13
14
  def initialize(definition_file)
14
15
  @definition_file = definition_file
15
16
  end
16
17
 
17
18
  def call(name)
18
- exec(namespace[name&.to_sym] || namespace.values.first)
19
+ return spidy.call(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
20
+
21
+ spidy.call(name, err: error_handler, &output)
19
22
  end
20
23
 
21
24
  def each(name)
22
- exec(spiders[name&.to_sym] || spiders.values.first)
25
+ return spidy.each(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
26
+
27
+ spidy.each(name, err: error_handler, &output)
23
28
  end
24
29
 
25
30
  def function
@@ -66,17 +71,4 @@ class Spidy::Shell
66
71
  RUBY
67
72
  end
68
73
  end
69
-
70
- private
71
-
72
- def exec(command)
73
- fail "undefined commmand[#{name}]" if command.nil?
74
- return command.call(&output_yielder) unless FileTest.pipe?(STDIN)
75
-
76
- STDIN.each do |line|
77
- command.call(line.strip, &output_yielder)
78
- rescue StandardError => e
79
- STDERR.puts("#{line.strip}\n #{e.message}")
80
- end
81
- end
82
74
  end
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.0.8'
4
+ VERSION = '0.0.9'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-20 00:00:00.000000000 Z
11
+ date: 2019-10-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler