spidy 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6d95a2c20a93c9c60969c4a738200391443f2e797273988d7aaa9a7abc812350
4
- data.tar.gz: 789cbc87df6997b5fbdb53db4b3fe7c91b6e899b995d232a227a2395c53f67c6
3
+ metadata.gz: f123b0f2bb611ea41bafe6a9cb5a7bb7564ba6be7044127e77fddf34dc3faed1
4
+ data.tar.gz: 87f0079575c62dac9a43d6900b88da45cce77a37f5b994376b0d31dd24b03fa5
5
5
  SHA512:
6
- metadata.gz: 4e811d34caebb96b1b1d0247ff1a547ed10e5e8a27d82df105c8a809a4b96011e0660805075e8864100d8df29545537c2dde3d3b742e5c35fae33e3a847ad5e9
7
- data.tar.gz: fa2f8c4039b009dfc746384dac40028956b880efa96d479485e85b702aad718f4b90e8b44b04912b8aa2bb94d54f27f50570978e2d07078943120bce206e38be
6
+ metadata.gz: b6554befde64315d01d2db11ba22b479629ae602ca4acddee51c941bd091144849ad15f2b98c220d29bd97c70c02fc2726061db429db2140a50bbfe0fce850a2
7
+ data.tar.gz: 511b7dccf5f93e4cf8a652d3039ec852c556cbd49e238c676029d1b0e611dce669ac84140c8b9be3f861426e8f248f6452698bf7319f5493cb7ecccb3cd1428d
@@ -4,6 +4,7 @@ require 'spidy/version'
4
4
  require 'active_support/all'
5
5
  require 'mechanize'
6
6
  require 'open-uri'
7
+ require 'socksify'
7
8
 
8
9
  #
9
10
  # web spider dsl engine
@@ -36,6 +37,9 @@ module Spidy
36
37
  spidy.instance_eval do
37
38
  undef :spider
38
39
  undef :define
40
+ undef :wait_time
41
+ undef :user_agent
42
+ undef :socks_proxy
39
43
  end
40
44
  spidy
41
45
  end
@@ -37,10 +37,6 @@ module Spidy::Binder
37
37
  @url = url
38
38
  end
39
39
 
40
- def scraper(name, source)
41
- lambda { |&block| @spidy.call(source, name: name, &block) }
42
- end
43
-
44
40
  def to_s
45
41
  to_h.to_json
46
42
  end
@@ -10,6 +10,24 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ #
14
+ # retry class
15
+ #
16
+ class Retry < StandardError
17
+ attr_reader :page
18
+ attr_reader :response_code
19
+ attr_reader :wait_time
20
+
21
+ def initialize(wait_time: 2, page: nil, error: nil)
22
+ @page = page
23
+ @wait_time = wait_time
24
+ @response_code = error.try(:response_code) || page.try(:response_code)
25
+ end
26
+ end
27
+
28
+ #
29
+ # default user agent
30
+ #
13
31
  USER_AGENT = [
14
32
  'Mozilla/5.0',
15
33
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -19,9 +37,19 @@ module Spidy::Connector
19
37
  'Safari/537.36'
20
38
  ].join(' ')
21
39
 
22
- def self.get(value)
23
- return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
40
+ #
41
+ # get connection handler
42
+ #
43
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
44
+ return value if value.respond_to?(:call)
45
+
46
+ connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
47
+ return connector if socks_proxy.nil?
24
48
 
25
- value
49
+ lambda do |url, &block|
50
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
51
+ connector.call(url, &block)
52
+ end
53
+ end
26
54
  end
27
55
  end
@@ -3,8 +3,11 @@
3
3
  #
4
4
  # Direct resource ( not network resource )
5
5
  #
6
- module Spidy::Connector::Direct
7
- def self.call(resource, &yielder)
6
+ class Spidy::Connector::Direct
7
+ def call(resource, &yielder)
8
8
  yielder.call(resource)
9
9
  end
10
+
11
+ def initialize(wait_time: nil, user_agent: nil)
12
+ end
10
13
  end
@@ -3,74 +3,60 @@
3
3
  #
4
4
  # Mechanize wrapper
5
5
  #
6
- module Spidy::Connector::Html
7
- #
8
- # retry class
9
- #
10
- class Retry < StandardError
11
- attr_reader :page
12
- attr_reader :response_code
13
- attr_reader :wait_time
14
-
15
- def initialize(wait_time: 2, page: nil, error: nil)
16
- @page = page
17
- @wait_time = wait_time
18
- @response_code = error.try(:response_code) || page.try(:response_code)
19
- end
6
+ class Spidy::Connector::Html
7
+ def initialize(wait_time:, user_agent:, logger: nil)
8
+ @wait_time = wait_time
9
+ @logger = logger || proc { |values| STDERR.puts(values.to_json) }
10
+ @agent = Mechanize.new
11
+ @user_agent = user_agent
12
+ @agent.user_agent = user_agent
20
13
  end
21
14
 
22
- @logger = proc { |values| STDERR.puts(values.to_json) }
23
- @agent = Mechanize.new
24
- @agent.user_agent = Spidy::Connector::USER_AGENT
25
-
26
- class << self
27
- attr_reader :agent
28
- attr_accessor :logger
15
+ attr_reader :agent
16
+ attr_reader :logger
29
17
 
30
- def call(url, encoding: nil, retry_count: 5, &yielder)
31
- fail 'url is not specified' if url.blank?
32
- if encoding
33
- agent.default_encoding = encoding
34
- agent.force_default_encoding = true
35
- end
36
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
37
- get(url, retry_count, yielder)
18
+ def call(url, encoding: nil, retry_count: 5, &yielder)
19
+ fail 'url is not specified' if url.blank?
20
+ if encoding
21
+ agent.default_encoding = encoding
22
+ agent.force_default_encoding = true
38
23
  end
24
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
25
+ get(url, retry_count, yielder)
26
+ end
39
27
 
40
- private
28
+ private
41
29
 
42
- def get(url, retry_count, yielder)
43
- connect(url, retry_count, yielder)
44
- rescue Retry => e
45
- logger.call('retry.accessed': Time.current,
46
- 'retry.uri': url,
47
- 'retry.response_code': e.response_code,
48
- 'retry.rest_count': retry_count)
30
+ def get(url, retry_count, yielder)
31
+ connect(url, retry_count, yielder)
32
+ rescue Spidy::Connector::Retry => e
33
+ logger.call('retry.accessed': Time.current,
34
+ 'retry.uri': url,
35
+ 'retry.response_code': e.response_code,
36
+ 'retry.rest_count': retry_count)
49
37
 
50
- @agent = Mechanize.new
51
- @agent.user_agent = Spidy::Connector::USER_AGENT
38
+ @agent = Mechanize.new
39
+ @agent.user_agent = @user_agent
52
40
 
53
- retry_count -= 1
54
- if retry_count.positive?
55
- sleep e.wait_time
56
- retry
57
- end
58
- raise e
41
+ retry_count -= 1
42
+ if retry_count.positive?
43
+ sleep e.wait_time
44
+ retry
59
45
  end
46
+ raise e
47
+ end
60
48
 
61
- def connect(url, retry_count, yielder)
62
- result = nil
63
- agent.get(url) do |page|
64
- fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
49
+ def connect(url, retry_count, yielder)
50
+ result = nil
51
+ agent.get(url) do |page|
52
+ fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
65
53
 
66
- result = yielder.call(page)
67
- end
68
- result
69
- rescue Mechanize::ResponseCodeError => e
70
- raise Retry, error: e if e.response_code == '429'
71
- raise Retry, error: e if e.response_code == '502'
72
- raise e
54
+ result = yielder.call(page)
73
55
  end
74
-
56
+ result
57
+ rescue Mechanize::ResponseCodeError => e
58
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
59
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
60
+ raise e
75
61
  end
76
62
  end
@@ -3,9 +3,17 @@
3
3
  #
4
4
  # OpenURI to JSON.parse
5
5
  #
6
- module Spidy::Connector::Json
7
- def self.call(url)
6
+ class Spidy::Connector::Json
7
+ def initialize(wait_time: nil, user_agent: nil)
8
+ @user_agent = user_agent
9
+ end
10
+
11
+ def call(url, &block)
8
12
  fail 'url is not specified' if url.blank?
9
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
13
+ connect(url, &block)
14
+ end
15
+
16
+ def connect(url)
17
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
10
18
  end
11
19
  end
@@ -3,12 +3,16 @@
3
3
  #
4
4
  # xml
5
5
  #
6
- module Spidy::Connector::Xml
7
- def self.call(url)
6
+ class Spidy::Connector::Xml
7
+ def call(url)
8
8
  fail 'URL is undefined' if url.blank?
9
9
 
10
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
10
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
11
11
  yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
12
  end
13
13
  end
14
+
15
+ def initialize(wait_time: nil, user_agent: nil)
16
+ @user_agent = user_agent
17
+ end
14
18
  end
@@ -18,4 +18,9 @@ class Spidy::Console
18
18
  def reload!
19
19
  @definition_file.eval_definition
20
20
  end
21
+
22
+ def connector(url, as:)
23
+ connector = Spidy::Connector.get(as)
24
+ connector.call(url) { |page| break page }
25
+ end
21
26
  end
@@ -16,6 +16,18 @@ module Spidy::Definition
16
16
  spidy.call(source, &yielder)
17
17
  end
18
18
 
19
+ def user_agent(user_agent)
20
+ @user_agent = user_agent
21
+ end
22
+
23
+ def wait_time(wait_time)
24
+ @wait_time = wait_time
25
+ end
26
+
27
+ def socks_proxy(host, port)
28
+ @socks_proxy = { host: host, port: port }
29
+ end
30
+
19
31
  def each(source = nil, name: :default, &yielder)
20
32
  name = name.presence || :default
21
33
  spidy = @namespace[:"#{name}_spider"]
@@ -26,7 +38,7 @@ module Spidy::Definition
26
38
 
27
39
  def spider(name = :default, connector: nil, as: nil, &define_block)
28
40
  @namespace ||= {}
29
- connector = Spidy::Connector.get(connector || as)
41
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
30
42
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
31
43
  define_block.call(yielder, connector, source)
32
44
  end
@@ -34,7 +46,7 @@ module Spidy::Definition
34
46
 
35
47
  def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
36
48
  @namespace ||= {}
37
- connector = Spidy::Connector.get(connector || as)
49
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent)
38
50
  binder = Spidy::Binder.get(self, binder || as)
39
51
  @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
40
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.4'
4
+ VERSION = '0.2.5'
5
5
  end
@@ -33,5 +33,6 @@ Gem::Specification.new do |spec|
33
33
 
34
34
  spec.add_runtime_dependency 'activesupport'
35
35
  spec.add_runtime_dependency 'mechanize'
36
+ spec.add_runtime_dependency 'socksify'
36
37
  spec.add_runtime_dependency 'pry'
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-31 00:00:00.000000000 Z
11
+ date: 2020-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: socksify
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: pry
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -136,7 +150,7 @@ dependencies:
136
150
  - - ">="
137
151
  - !ruby/object:Gem::Version
138
152
  version: '0'
139
- description:
153
+ description:
140
154
  email:
141
155
  - aileron.cc@gmail.com
142
156
  executables:
@@ -183,7 +197,7 @@ homepage: https://github.com/aileron-inc/spidy
183
197
  licenses:
184
198
  - MIT
185
199
  metadata: {}
186
- post_install_message:
200
+ post_install_message:
187
201
  rdoc_options: []
188
202
  require_paths:
189
203
  - lib
@@ -199,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
199
213
  version: '0'
200
214
  requirements: []
201
215
  rubygems_version: 3.0.3
202
- signing_key:
216
+ signing_key:
203
217
  specification_version: 4
204
218
  summary: web spider dsl
205
219
  test_files: []