spidy 0.2.4 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6d95a2c20a93c9c60969c4a738200391443f2e797273988d7aaa9a7abc812350
4
- data.tar.gz: 789cbc87df6997b5fbdb53db4b3fe7c91b6e899b995d232a227a2395c53f67c6
3
+ metadata.gz: 44d16509821e1779d821effb7c571d2a11cbb79166168943242628b2e53457ec
4
+ data.tar.gz: 5b52389a8042c9e69aaeead0d96b81553da2e5f5e51ad16696f87249e38067bd
5
5
  SHA512:
6
- metadata.gz: 4e811d34caebb96b1b1d0247ff1a547ed10e5e8a27d82df105c8a809a4b96011e0660805075e8864100d8df29545537c2dde3d3b742e5c35fae33e3a847ad5e9
7
- data.tar.gz: fa2f8c4039b009dfc746384dac40028956b880efa96d479485e85b702aad718f4b90e8b44b04912b8aa2bb94d54f27f50570978e2d07078943120bce206e38be
6
+ metadata.gz: 3475b05ea1235b388960416a03ab96adb64608ce4046d1812b4d80f038a91b99343882e45413ee6a2f74e29626869aca8a48d34f8e57c64b480c8893efcda123
7
+ data.tar.gz: 7e4fd1abf5898a7a7273d4eaf53021e4c1898d995e8e0d23abfcbf80c3e1b45b2f059c95e48707577bae72a4f158d7cba7f9be097e7e8a277d89e683386c32d6
@@ -1,10 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.2.3)
4
+ spidy (0.2.9)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
8
+ socksify
9
+ tor
8
10
 
9
11
  GEM
10
12
  remote: https://rubygems.org/
@@ -16,7 +18,7 @@ GEM
16
18
  tzinfo (~> 1.1)
17
19
  zeitwerk (~> 2.2, >= 2.2.2)
18
20
  coderay (1.1.2)
19
- concurrent-ruby (1.1.6)
21
+ concurrent-ruby (1.1.7)
20
22
  connection_pool (2.2.3)
21
23
  diff-lcs (1.3)
22
24
  domain_name (0.5.20190701)
@@ -40,7 +42,7 @@ GEM
40
42
  mime-types-data (~> 3.2015)
41
43
  mime-types-data (3.2020.0512)
42
44
  mini_portile2 (2.4.0)
43
- minitest (5.14.1)
45
+ minitest (5.14.2)
44
46
  mixlib-shellout (2.4.4)
45
47
  net-http-digest_auth (1.4.1)
46
48
  net-http-persistent (4.0.0)
@@ -72,7 +74,9 @@ GEM
72
74
  diff-lcs (>= 1.2.0, < 2.0)
73
75
  rspec-support (~> 3.8.0)
74
76
  rspec-support (3.8.2)
77
+ socksify (1.7.1)
75
78
  thread_safe (0.3.6)
79
+ tor (0.1.4)
76
80
  tzinfo (1.2.7)
77
81
  thread_safe (~> 0.1)
78
82
  unf (0.1.4)
@@ -4,6 +4,8 @@ require 'spidy/version'
4
4
  require 'active_support/all'
5
5
  require 'mechanize'
6
6
  require 'open-uri'
7
+ require 'socksify'
8
+ require 'tor'
7
9
 
8
10
  #
9
11
  # web spider dsl engine
@@ -36,6 +38,9 @@ module Spidy
36
38
  spidy.instance_eval do
37
39
  undef :spider
38
40
  undef :define
41
+ undef :wait_time
42
+ undef :user_agent
43
+ undef :socks_proxy
39
44
  end
40
45
  spidy
41
46
  end
@@ -37,10 +37,6 @@ module Spidy::Binder
37
37
  @url = url
38
38
  end
39
39
 
40
- def scraper(name, source)
41
- lambda { |&block| @spidy.call(source, name: name, &block) }
42
- end
43
-
44
40
  def to_s
45
41
  to_h.to_json
46
42
  end
@@ -10,6 +10,9 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ #
14
+ # default user agent
15
+ #
13
16
  USER_AGENT = [
14
17
  'Mozilla/5.0',
15
18
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -19,9 +22,77 @@ module Spidy::Connector
19
22
  'Safari/537.36'
20
23
  ].join(' ')
21
24
 
22
- def self.get(value)
23
- return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
25
+ #
26
+ # error output logger
27
+ #
28
+ DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
24
29
 
25
- value
30
+ #
31
+ # static method
32
+ #
33
+ module StaticAccessor
34
+ extend ActiveSupport::Concern
35
+ class_methods do
36
+ def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
37
+ new(wait_time: wait_time, user_agent: user_agent, logger: logger).call(url, &block)
38
+ end
39
+ end
40
+ end
41
+
42
+ #
43
+ # retry class
44
+ #
45
+ class Retry < StandardError
46
+ attr_reader :page
47
+ attr_reader :response_code
48
+ attr_reader :wait_time
49
+
50
+ def initialize(wait_time: 2, page: nil, error: nil)
51
+ @page = page
52
+ @wait_time = wait_time
53
+ @response_code = error.try(:response_code) || page.try(:response_code)
54
+ end
55
+ end
56
+
57
+ class Builder
58
+ attr_reader :origin_connector, :proxy_connector
59
+
60
+ def initialize(connector, socks_proxy)
61
+ @socks_proxy = socks_proxy
62
+ @origin_connector = connector
63
+ @proxy_connector =
64
+ lambda do |url, &block|
65
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
66
+ connector.call(url, &block)
67
+ end
68
+ end
69
+ end
70
+
71
+ def proxy_disabled?
72
+ !tor?
73
+ end
74
+
75
+ def tor?
76
+ Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
77
+ true
78
+ rescue Errno::ECONNREFUSED
79
+ false
80
+ end
81
+ end
82
+
83
+ #
84
+ # get connection handler
85
+ #
86
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
87
+ return value if value.respond_to?(:call)
88
+
89
+ builder = Builder.new(const_get(value.to_s.classify).new(
90
+ wait_time: wait_time || 5,
91
+ user_agent: user_agent || USER_AGENT,
92
+ logger: logger || DEFAULT_LOGGER,
93
+ ), socks_proxy)
94
+ return builder.origin_connector if socks_proxy.nil? || builder.proxy_disabled?
95
+
96
+ builder.proxy_connector
26
97
  end
27
98
  end
@@ -3,8 +3,11 @@
3
3
  #
4
4
  # Direct resource ( not network resource )
5
5
  #
6
- module Spidy::Connector::Direct
7
- def self.call(resource, &yielder)
6
+ class Spidy::Connector::Direct
7
+ def call(resource, &yielder)
8
8
  yielder.call(resource)
9
9
  end
10
+
11
+ def initialize(wait_time: nil, user_agent: nil, logger: nil)
12
+ end
10
13
  end
@@ -3,74 +3,62 @@
3
3
  #
4
4
  # Mechanize wrapper
5
5
  #
6
- module Spidy::Connector::Html
7
- #
8
- # retry class
9
- #
10
- class Retry < StandardError
11
- attr_reader :page
12
- attr_reader :response_code
13
- attr_reader :wait_time
6
+ class Spidy::Connector::Html
7
+ include Spidy::Connector::StaticAccessor
14
8
 
15
- def initialize(wait_time: 2, page: nil, error: nil)
16
- @page = page
17
- @wait_time = wait_time
18
- @response_code = error.try(:response_code) || page.try(:response_code)
19
- end
9
+ def initialize(wait_time:, user_agent:, logger: nil)
10
+ @wait_time = wait_time
11
+ @logger = logger
12
+ @agent = Mechanize.new
13
+ @user_agent = user_agent
14
+ @agent.user_agent = user_agent
20
15
  end
21
16
 
22
- @logger = proc { |values| STDERR.puts(values.to_json) }
23
- @agent = Mechanize.new
24
- @agent.user_agent = Spidy::Connector::USER_AGENT
25
-
26
- class << self
27
- attr_reader :agent
28
- attr_accessor :logger
17
+ attr_reader :agent
18
+ attr_reader :logger
29
19
 
30
- def call(url, encoding: nil, retry_count: 5, &yielder)
31
- fail 'url is not specified' if url.blank?
32
- if encoding
33
- agent.default_encoding = encoding
34
- agent.force_default_encoding = true
35
- end
36
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
37
- get(url, retry_count, yielder)
20
+ def call(url, encoding: nil, retry_count: 5, &yielder)
21
+ fail 'url is not specified' if url.blank?
22
+ if encoding
23
+ agent.default_encoding = encoding
24
+ agent.force_default_encoding = true
38
25
  end
26
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
+ get(url, retry_count, yielder)
28
+ end
39
29
 
40
- private
30
+ private
41
31
 
42
- def get(url, retry_count, yielder)
43
- connect(url, retry_count, yielder)
44
- rescue Retry => e
45
- logger.call('retry.accessed': Time.current,
46
- 'retry.uri': url,
47
- 'retry.response_code': e.response_code,
48
- 'retry.rest_count': retry_count)
32
+ def get(url, retry_count, yielder)
33
+ connect(url, retry_count, yielder)
34
+ rescue Spidy::Connector::Retry => e
35
+ logger.call('retry.accessed': Time.current,
36
+ 'retry.uri': url,
37
+ 'retry.response_code': e.response_code,
38
+ 'retry.rest_count': retry_count)
49
39
 
50
- @agent = Mechanize.new
51
- @agent.user_agent = Spidy::Connector::USER_AGENT
40
+ @agent = Mechanize.new
41
+ @agent.user_agent = @user_agent
52
42
 
53
- retry_count -= 1
54
- if retry_count.positive?
55
- sleep e.wait_time
56
- retry
57
- end
58
- raise e
43
+ retry_count -= 1
44
+ if retry_count.positive?
45
+ sleep e.wait_time
46
+ retry
59
47
  end
48
+ raise e
49
+ end
60
50
 
61
- def connect(url, retry_count, yielder)
62
- result = nil
63
- agent.get(url) do |page|
64
- fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
51
+ def connect(url, retry_count, yielder)
52
+ result = nil
53
+ agent.get(url) do |page|
54
+ fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
65
55
 
66
- result = yielder.call(page)
67
- end
68
- result
69
- rescue Mechanize::ResponseCodeError => e
70
- raise Retry, error: e if e.response_code == '429'
71
- raise Retry, error: e if e.response_code == '502'
72
- raise e
56
+ result = yielder.call(page)
73
57
  end
74
-
58
+ result
59
+ rescue Mechanize::ResponseCodeError => e
60
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
61
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
62
+ raise e
75
63
  end
76
64
  end
@@ -3,9 +3,35 @@
3
3
  #
4
4
  # OpenURI to JSON.parse
5
5
  #
6
- module Spidy::Connector::Json
7
- def self.call(url)
6
+ class Spidy::Connector::Json
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ attr_reader :logger
10
+
11
+ def initialize(wait_time: nil, user_agent: nil, logger: nil)
12
+ @wait_time = wait_time
13
+ @user_agent = user_agent
14
+ @logger = logger
15
+ end
16
+
17
+ def call(url, &block)
8
18
  fail 'url is not specified' if url.blank?
9
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
19
+ connect(url, &block)
20
+ end
21
+
22
+ def connect(url, retry_count: 5)
23
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
24
+ rescue OpenURI::HTTPError => e
25
+ logger.call('retry.accessed': Time.current,
26
+ 'retry.uri': url,
27
+ 'retry.response_code': e.message,
28
+ 'retry.rest_count': retry_count)
29
+
30
+ retry_count -= 1
31
+ if retry_count.positive?
32
+ sleep @wait_time
33
+ retry
34
+ end
35
+ raise e
10
36
  end
11
37
  end
@@ -3,12 +3,18 @@
3
3
  #
4
4
  # xml
5
5
  #
6
- module Spidy::Connector::Xml
7
- def self.call(url)
6
+ class Spidy::Connector::Xml
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ def call(url)
8
10
  fail 'URL is undefined' if url.blank?
9
11
 
10
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
12
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
11
13
  yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
14
  end
13
15
  end
16
+
17
+ def initialize(user_agent: nil, logger: nil)
18
+ @user_agent = user_agent
19
+ end
14
20
  end
@@ -18,4 +18,9 @@ class Spidy::Console
18
18
  def reload!
19
19
  @definition_file.eval_definition
20
20
  end
21
+
22
+ def connector(url, as:)
23
+ connector = Spidy::Connector.get(as)
24
+ connector.call(url) { |page| break page }
25
+ end
21
26
  end
@@ -16,6 +16,18 @@ module Spidy::Definition
16
16
  spidy.call(source, &yielder)
17
17
  end
18
18
 
19
+ def user_agent(user_agent)
20
+ @user_agent = user_agent
21
+ end
22
+
23
+ def wait_time(wait_time)
24
+ @wait_time = wait_time
25
+ end
26
+
27
+ def socks_proxy(host, port)
28
+ @socks_proxy = { host: host, port: port }
29
+ end
30
+
19
31
  def each(source = nil, name: :default, &yielder)
20
32
  name = name.presence || :default
21
33
  spidy = @namespace[:"#{name}_spider"]
@@ -26,7 +38,7 @@ module Spidy::Definition
26
38
 
27
39
  def spider(name = :default, connector: nil, as: nil, &define_block)
28
40
  @namespace ||= {}
29
- connector = Spidy::Connector.get(connector || as)
41
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
30
42
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
31
43
  define_block.call(yielder, connector, source)
32
44
  end
@@ -34,7 +46,7 @@ module Spidy::Definition
34
46
 
35
47
  def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
36
48
  @namespace ||= {}
37
- connector = Spidy::Connector.get(connector || as)
49
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
38
50
  binder = Spidy::Binder.get(self, binder || as)
39
51
  @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
40
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.4'
4
+ VERSION = '0.2.9'
5
5
  end
@@ -31,7 +31,9 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency 'ffaker'
32
32
  spec.add_development_dependency 'rspec-command'
33
33
 
34
+ spec.add_runtime_dependency 'tor'
34
35
  spec.add_runtime_dependency 'activesupport'
35
36
  spec.add_runtime_dependency 'mechanize'
37
+ spec.add_runtime_dependency 'socksify'
36
38
  spec.add_runtime_dependency 'pry'
37
39
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-31 00:00:00.000000000 Z
11
+ date: 2020-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: tor
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: activesupport
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -122,6 +136,20 @@ dependencies:
122
136
  - - ">="
123
137
  - !ruby/object:Gem::Version
124
138
  version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: socksify
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
125
153
  - !ruby/object:Gem::Dependency
126
154
  name: pry
127
155
  requirement: !ruby/object:Gem::Requirement
@@ -136,7 +164,7 @@ dependencies:
136
164
  - - ">="
137
165
  - !ruby/object:Gem::Version
138
166
  version: '0'
139
- description:
167
+ description:
140
168
  email:
141
169
  - aileron.cc@gmail.com
142
170
  executables:
@@ -183,7 +211,7 @@ homepage: https://github.com/aileron-inc/spidy
183
211
  licenses:
184
212
  - MIT
185
213
  metadata: {}
186
- post_install_message:
214
+ post_install_message:
187
215
  rdoc_options: []
188
216
  require_paths:
189
217
  - lib
@@ -199,7 +227,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
199
227
  version: '0'
200
228
  requirements: []
201
229
  rubygems_version: 3.0.3
202
- signing_key:
230
+ signing_key:
203
231
  specification_version: 4
204
232
  summary: web spider dsl
205
233
  test_files: []