spidy 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6d95a2c20a93c9c60969c4a738200391443f2e797273988d7aaa9a7abc812350
4
- data.tar.gz: 789cbc87df6997b5fbdb53db4b3fe7c91b6e899b995d232a227a2395c53f67c6
3
+ metadata.gz: f123b0f2bb611ea41bafe6a9cb5a7bb7564ba6be7044127e77fddf34dc3faed1
4
+ data.tar.gz: 87f0079575c62dac9a43d6900b88da45cce77a37f5b994376b0d31dd24b03fa5
5
5
  SHA512:
6
- metadata.gz: 4e811d34caebb96b1b1d0247ff1a547ed10e5e8a27d82df105c8a809a4b96011e0660805075e8864100d8df29545537c2dde3d3b742e5c35fae33e3a847ad5e9
7
- data.tar.gz: fa2f8c4039b009dfc746384dac40028956b880efa96d479485e85b702aad718f4b90e8b44b04912b8aa2bb94d54f27f50570978e2d07078943120bce206e38be
6
+ metadata.gz: b6554befde64315d01d2db11ba22b479629ae602ca4acddee51c941bd091144849ad15f2b98c220d29bd97c70c02fc2726061db429db2140a50bbfe0fce850a2
7
+ data.tar.gz: 511b7dccf5f93e4cf8a652d3039ec852c556cbd49e238c676029d1b0e611dce669ac84140c8b9be3f861426e8f248f6452698bf7319f5493cb7ecccb3cd1428d
@@ -4,6 +4,7 @@ require 'spidy/version'
4
4
  require 'active_support/all'
5
5
  require 'mechanize'
6
6
  require 'open-uri'
7
+ require 'socksify'
7
8
 
8
9
  #
9
10
  # web spider dsl engine
@@ -36,6 +37,9 @@ module Spidy
36
37
  spidy.instance_eval do
37
38
  undef :spider
38
39
  undef :define
40
+ undef :wait_time
41
+ undef :user_agent
42
+ undef :socks_proxy
39
43
  end
40
44
  spidy
41
45
  end
@@ -37,10 +37,6 @@ module Spidy::Binder
37
37
  @url = url
38
38
  end
39
39
 
40
- def scraper(name, source)
41
- lambda { |&block| @spidy.call(source, name: name, &block) }
42
- end
43
-
44
40
  def to_s
45
41
  to_h.to_json
46
42
  end
@@ -10,6 +10,24 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ #
14
+ # retry class
15
+ #
16
+ class Retry < StandardError
17
+ attr_reader :page
18
+ attr_reader :response_code
19
+ attr_reader :wait_time
20
+
21
+ def initialize(wait_time: 2, page: nil, error: nil)
22
+ @page = page
23
+ @wait_time = wait_time
24
+ @response_code = error.try(:response_code) || page.try(:response_code)
25
+ end
26
+ end
27
+
28
+ #
29
+ # default user agent
30
+ #
13
31
  USER_AGENT = [
14
32
  'Mozilla/5.0',
15
33
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -19,9 +37,19 @@ module Spidy::Connector
19
37
  'Safari/537.36'
20
38
  ].join(' ')
21
39
 
22
- def self.get(value)
23
- return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
40
+ #
41
+ # get connection handler
42
+ #
43
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
44
+ return value if value.respond_to?(:call)
45
+
46
+ connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
47
+ return connector if socks_proxy.nil?
24
48
 
25
- value
49
+ lambda do |url, &block|
50
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
51
+ connector.call(url, &block)
52
+ end
53
+ end
26
54
  end
27
55
  end
@@ -3,8 +3,11 @@
3
3
  #
4
4
  # Direct resource ( not network resource )
5
5
  #
6
- module Spidy::Connector::Direct
7
- def self.call(resource, &yielder)
6
+ class Spidy::Connector::Direct
7
+ def call(resource, &yielder)
8
8
  yielder.call(resource)
9
9
  end
10
+
11
+ def initialize(wait_time: nil, user_agent: nil)
12
+ end
10
13
  end
@@ -3,74 +3,60 @@
3
3
  #
4
4
  # Mechanize wrapper
5
5
  #
6
- module Spidy::Connector::Html
7
- #
8
- # retry class
9
- #
10
- class Retry < StandardError
11
- attr_reader :page
12
- attr_reader :response_code
13
- attr_reader :wait_time
14
-
15
- def initialize(wait_time: 2, page: nil, error: nil)
16
- @page = page
17
- @wait_time = wait_time
18
- @response_code = error.try(:response_code) || page.try(:response_code)
19
- end
6
+ class Spidy::Connector::Html
7
+ def initialize(wait_time:, user_agent:, logger: nil)
8
+ @wait_time = wait_time
9
+ @logger = logger || proc { |values| STDERR.puts(values.to_json) }
10
+ @agent = Mechanize.new
11
+ @user_agent = user_agent
12
+ @agent.user_agent = user_agent
20
13
  end
21
14
 
22
- @logger = proc { |values| STDERR.puts(values.to_json) }
23
- @agent = Mechanize.new
24
- @agent.user_agent = Spidy::Connector::USER_AGENT
25
-
26
- class << self
27
- attr_reader :agent
28
- attr_accessor :logger
15
+ attr_reader :agent
16
+ attr_reader :logger
29
17
 
30
- def call(url, encoding: nil, retry_count: 5, &yielder)
31
- fail 'url is not specified' if url.blank?
32
- if encoding
33
- agent.default_encoding = encoding
34
- agent.force_default_encoding = true
35
- end
36
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
37
- get(url, retry_count, yielder)
18
+ def call(url, encoding: nil, retry_count: 5, &yielder)
19
+ fail 'url is not specified' if url.blank?
20
+ if encoding
21
+ agent.default_encoding = encoding
22
+ agent.force_default_encoding = true
38
23
  end
24
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
25
+ get(url, retry_count, yielder)
26
+ end
39
27
 
40
- private
28
+ private
41
29
 
42
- def get(url, retry_count, yielder)
43
- connect(url, retry_count, yielder)
44
- rescue Retry => e
45
- logger.call('retry.accessed': Time.current,
46
- 'retry.uri': url,
47
- 'retry.response_code': e.response_code,
48
- 'retry.rest_count': retry_count)
30
+ def get(url, retry_count, yielder)
31
+ connect(url, retry_count, yielder)
32
+ rescue Spidy::Connector::Retry => e
33
+ logger.call('retry.accessed': Time.current,
34
+ 'retry.uri': url,
35
+ 'retry.response_code': e.response_code,
36
+ 'retry.rest_count': retry_count)
49
37
 
50
- @agent = Mechanize.new
51
- @agent.user_agent = Spidy::Connector::USER_AGENT
38
+ @agent = Mechanize.new
39
+ @agent.user_agent = @user_agent
52
40
 
53
- retry_count -= 1
54
- if retry_count.positive?
55
- sleep e.wait_time
56
- retry
57
- end
58
- raise e
41
+ retry_count -= 1
42
+ if retry_count.positive?
43
+ sleep e.wait_time
44
+ retry
59
45
  end
46
+ raise e
47
+ end
60
48
 
61
- def connect(url, retry_count, yielder)
62
- result = nil
63
- agent.get(url) do |page|
64
- fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
49
+ def connect(url, retry_count, yielder)
50
+ result = nil
51
+ agent.get(url) do |page|
52
+ fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
65
53
 
66
- result = yielder.call(page)
67
- end
68
- result
69
- rescue Mechanize::ResponseCodeError => e
70
- raise Retry, error: e if e.response_code == '429'
71
- raise Retry, error: e if e.response_code == '502'
72
- raise e
54
+ result = yielder.call(page)
73
55
  end
74
-
56
+ result
57
+ rescue Mechanize::ResponseCodeError => e
58
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
59
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
60
+ raise e
75
61
  end
76
62
  end
@@ -3,9 +3,17 @@
3
3
  #
4
4
  # OpenURI to JSON.parse
5
5
  #
6
- module Spidy::Connector::Json
7
- def self.call(url)
6
+ class Spidy::Connector::Json
7
+ def initialize(wait_time: nil, user_agent: nil)
8
+ @user_agent = user_agent
9
+ end
10
+
11
+ def call(url, &block)
8
12
  fail 'url is not specified' if url.blank?
9
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
13
+ connect(url, &block)
14
+ end
15
+
16
+ def connect(url)
17
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
10
18
  end
11
19
  end
@@ -3,12 +3,16 @@
3
3
  #
4
4
  # xml
5
5
  #
6
- module Spidy::Connector::Xml
7
- def self.call(url)
6
+ class Spidy::Connector::Xml
7
+ def call(url)
8
8
  fail 'URL is undefined' if url.blank?
9
9
 
10
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
10
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
11
11
  yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
12
  end
13
13
  end
14
+
15
+ def initialize(wait_time: nil, user_agent: nil)
16
+ @user_agent = user_agent
17
+ end
14
18
  end
@@ -18,4 +18,9 @@ class Spidy::Console
18
18
  def reload!
19
19
  @definition_file.eval_definition
20
20
  end
21
+
22
+ def connector(url, as:)
23
+ connector = Spidy::Connector.get(as)
24
+ connector.call(url) { |page| break page }
25
+ end
21
26
  end
@@ -16,6 +16,18 @@ module Spidy::Definition
16
16
  spidy.call(source, &yielder)
17
17
  end
18
18
 
19
+ def user_agent(user_agent)
20
+ @user_agent = user_agent
21
+ end
22
+
23
+ def wait_time(wait_time)
24
+ @wait_time = wait_time
25
+ end
26
+
27
+ def socks_proxy(host, port)
28
+ @socks_proxy = { host: host, port: port }
29
+ end
30
+
19
31
  def each(source = nil, name: :default, &yielder)
20
32
  name = name.presence || :default
21
33
  spidy = @namespace[:"#{name}_spider"]
@@ -26,7 +38,7 @@ module Spidy::Definition
26
38
 
27
39
  def spider(name = :default, connector: nil, as: nil, &define_block)
28
40
  @namespace ||= {}
29
- connector = Spidy::Connector.get(connector || as)
41
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
30
42
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
31
43
  define_block.call(yielder, connector, source)
32
44
  end
@@ -34,7 +46,7 @@ module Spidy::Definition
34
46
 
35
47
  def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
36
48
  @namespace ||= {}
37
- connector = Spidy::Connector.get(connector || as)
49
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent)
38
50
  binder = Spidy::Binder.get(self, binder || as)
39
51
  @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
40
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.4'
4
+ VERSION = '0.2.5'
5
5
  end
@@ -33,5 +33,6 @@ Gem::Specification.new do |spec|
33
33
 
34
34
  spec.add_runtime_dependency 'activesupport'
35
35
  spec.add_runtime_dependency 'mechanize'
36
+ spec.add_runtime_dependency 'socksify'
36
37
  spec.add_runtime_dependency 'pry'
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-31 00:00:00.000000000 Z
11
+ date: 2020-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: socksify
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: pry
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -136,7 +150,7 @@ dependencies:
136
150
  - - ">="
137
151
  - !ruby/object:Gem::Version
138
152
  version: '0'
139
- description:
153
+ description:
140
154
  email:
141
155
  - aileron.cc@gmail.com
142
156
  executables:
@@ -183,7 +197,7 @@ homepage: https://github.com/aileron-inc/spidy
183
197
  licenses:
184
198
  - MIT
185
199
  metadata: {}
186
- post_install_message:
200
+ post_install_message:
187
201
  rdoc_options: []
188
202
  require_paths:
189
203
  - lib
@@ -199,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
199
213
  version: '0'
200
214
  requirements: []
201
215
  rubygems_version: 3.0.3
202
- signing_key:
216
+ signing_key:
203
217
  specification_version: 4
204
218
  summary: web spider dsl
205
219
  test_files: []