spidy 0.2.2 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c032c78b7580bada781db301697b264b337d7c829bc5df4c3810a95770f56ee
4
- data.tar.gz: 4b282b06cc91b3fcdef8187381ded42d01a689720ebfede4fb135b1639d47f11
3
+ metadata.gz: 177e6b5e3b4f864251d7071d23b2b42b219d8193673a5b57ee9b376e393092a9
4
+ data.tar.gz: 92b39a277d752eb5b1944f822702eae803356f7f5e3211b1f91e2d9ad635090e
5
5
  SHA512:
6
- metadata.gz: de20181e766b5b8d9189e657bfa20f4a4720e2c0a0adb7ece522309433ef5d93325d98d63a1b31304bd9a2575391318c3330c01f66356ae4110b7005c6560253
7
- data.tar.gz: 5cf49000bb2f234c111216f9a554267df18a360c4b0511685c000b8e22312feaf3ba5b0b53b56612f0653157cd51e1da46e6140d7e4902b8a4a062894256b1d1
6
+ metadata.gz: b4078e3d1ad9813b0273f267d99bf7a9bf4473bb86b6cfa4fa283e6c5b060fa248e0b6fb517bede448f94e0dcebbb90ee71ef4eeb4bd5a45b1405c50005985fe
7
+ data.tar.gz: 00aa2cd512cf3a083082227234f59148d86e8ee0e4f602e07d1c67cbf9f3e8160b9144ebd29f3a67efded607c1705c2ce3008df235325afc7f3a715a6dd7527d
@@ -1 +1 @@
1
- 2.6.5
1
+ 2.6.6
@@ -1,30 +1,32 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.1.6)
4
+ spidy (0.2.6)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
8
+ socksify
9
+ tor
8
10
 
9
11
  GEM
10
12
  remote: https://rubygems.org/
11
13
  specs:
12
- activesupport (6.0.2.1)
14
+ activesupport (6.0.3.2)
13
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
14
16
  i18n (>= 0.7, < 2)
15
17
  minitest (~> 5.1)
16
18
  tzinfo (~> 1.1)
17
- zeitwerk (~> 2.2)
19
+ zeitwerk (~> 2.2, >= 2.2.2)
18
20
  coderay (1.1.2)
19
- concurrent-ruby (1.1.5)
20
- connection_pool (2.2.2)
21
+ concurrent-ruby (1.1.7)
22
+ connection_pool (2.2.3)
21
23
  diff-lcs (1.3)
22
24
  domain_name (0.5.20190701)
23
25
  unf (>= 0.0.5, < 1.0.0)
24
26
  ffaker (2.10.0)
25
27
  http-cookie (1.0.3)
26
28
  domain_name (~> 0.5)
27
- i18n (1.8.2)
29
+ i18n (1.8.5)
28
30
  concurrent-ruby (~> 1.0)
29
31
  mechanize (2.7.6)
30
32
  domain_name (~> 0.5, >= 0.5.1)
@@ -38,14 +40,14 @@ GEM
38
40
  method_source (0.9.2)
39
41
  mime-types (3.3.1)
40
42
  mime-types-data (~> 3.2015)
41
- mime-types-data (3.2019.1009)
43
+ mime-types-data (3.2020.0512)
42
44
  mini_portile2 (2.4.0)
43
- minitest (5.14.0)
45
+ minitest (5.14.2)
44
46
  mixlib-shellout (2.4.4)
45
47
  net-http-digest_auth (1.4.1)
46
- net-http-persistent (3.1.0)
48
+ net-http-persistent (4.0.0)
47
49
  connection_pool (~> 2.2)
48
- nokogiri (1.10.7)
50
+ nokogiri (1.10.10)
49
51
  mini_portile2 (~> 2.4.0)
50
52
  ntlm-http (0.1.1)
51
53
  pry (0.12.2)
@@ -72,14 +74,16 @@ GEM
72
74
  diff-lcs (>= 1.2.0, < 2.0)
73
75
  rspec-support (~> 3.8.0)
74
76
  rspec-support (3.8.2)
77
+ socksify (1.7.1)
75
78
  thread_safe (0.3.6)
76
- tzinfo (1.2.6)
79
+ tor (0.1.4)
80
+ tzinfo (1.2.7)
77
81
  thread_safe (~> 0.1)
78
82
  unf (0.1.4)
79
83
  unf_ext
80
- unf_ext (0.0.7.6)
84
+ unf_ext (0.0.7.7)
81
85
  webrobots (0.1.2)
82
- zeitwerk (2.2.2)
86
+ zeitwerk (2.4.0)
83
87
 
84
88
  PLATFORMS
85
89
  ruby
@@ -94,4 +98,4 @@ DEPENDENCIES
94
98
  spidy!
95
99
 
96
100
  BUNDLED WITH
97
- 2.1.2
101
+ 2.1.4
data/README.md CHANGED
@@ -60,7 +60,7 @@ call('http://example.com') { |html| break html } # html as nokogiri object ( mec
60
60
  ```
61
61
 
62
62
  ### When used from the ruby code
63
- ``
63
+ ```rb
64
64
  a = Spidy.define do
65
65
  # Implementing spiders and scrapers
66
66
  end
@@ -4,13 +4,14 @@ require 'spidy/version'
4
4
  require 'active_support/all'
5
5
  require 'mechanize'
6
6
  require 'open-uri'
7
+ require 'socksify'
8
+ require 'tor'
7
9
 
8
10
  #
9
11
  # web spider dsl engine
10
12
  #
11
13
  module Spidy
12
14
  extend ActiveSupport::Autoload
13
- autoload :Interface
14
15
  autoload :Shell
15
16
  autoload :CommandLine
16
17
  autoload :Console
@@ -34,6 +35,13 @@ module Spidy
34
35
  module_eval(&block)
35
36
  end
36
37
  end
37
- Spidy::Interface.new(spidy)
38
+ spidy.instance_eval do
39
+ undef :spider
40
+ undef :define
41
+ undef :wait_time
42
+ undef :user_agent
43
+ undef :socks_proxy
44
+ end
45
+ spidy
38
46
  end
39
47
  end
@@ -37,10 +37,6 @@ module Spidy::Binder
37
37
  @url = url
38
38
  end
39
39
 
40
- def scraper(name, source)
41
- lambda { |&block| @spidy.call(source, name: name, &block) }
42
- end
43
-
44
40
  def to_s
45
41
  to_h.to_json
46
42
  end
@@ -10,6 +10,33 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ module StaticAccessor
14
+ extend ActiveSupport::Concern
15
+ class_methods do
16
+ def call(url, wait_time: nil, user_agent: Spidy::Connector::USER_AGENT, &block)
17
+ new(wait_time: wait_time, user_agent: user_agent).call(url, &block)
18
+ end
19
+ end
20
+ end
21
+
22
+ #
23
+ # retry class
24
+ #
25
+ class Retry < StandardError
26
+ attr_reader :page
27
+ attr_reader :response_code
28
+ attr_reader :wait_time
29
+
30
+ def initialize(wait_time: 2, page: nil, error: nil)
31
+ @page = page
32
+ @wait_time = wait_time
33
+ @response_code = error.try(:response_code) || page.try(:response_code)
34
+ end
35
+ end
36
+
37
+ #
38
+ # default user agent
39
+ #
13
40
  USER_AGENT = [
14
41
  'Mozilla/5.0',
15
42
  '(Macintosh; Intel Mac OS X 10_12_6)',
@@ -19,9 +46,19 @@ module Spidy::Connector
19
46
  'Safari/537.36'
20
47
  ].join(' ')
21
48
 
22
- def self.get(value)
23
- return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
49
+ #
50
+ # get connection handler
51
+ #
52
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
53
+ return value if value.respond_to?(:call)
54
+
55
+ connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
56
+ return connector if socks_proxy.nil?
24
57
 
25
- value
58
+ lambda do |url, &block|
59
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
60
+ connector.call(url, &block)
61
+ end
62
+ end
26
63
  end
27
64
  end
@@ -3,8 +3,11 @@
3
3
  #
4
4
  # Direct resource ( not network resource )
5
5
  #
6
- module Spidy::Connector::Direct
7
- def self.call(resource, &yielder)
6
+ class Spidy::Connector::Direct
7
+ def call(resource, &yielder)
8
8
  yielder.call(resource)
9
9
  end
10
+
11
+ def initialize(wait_time: nil, user_agent: nil)
12
+ end
10
13
  end
@@ -3,73 +3,62 @@
3
3
  #
4
4
  # Mechanize wrapper
5
5
  #
6
- module Spidy::Connector::Html
7
- #
8
- # retry class
9
- #
10
- class Retry < StandardError
11
- attr_reader :page
12
- attr_reader :response_code
13
- attr_reader :wait_time
6
+ class Spidy::Connector::Html
7
+ include Spidy::Connector::StaticAccessor
14
8
 
15
- def initialize(wait_time: 2, page: nil, error: nil)
16
- @page = page
17
- @wait_time = wait_time
18
- @response_code = error.try(:response_code) || page.try(:response_code)
19
- end
9
+ def initialize(wait_time:, user_agent:, logger: nil)
10
+ @wait_time = wait_time
11
+ @logger = logger || proc { |values| STDERR.puts(values.to_json) }
12
+ @agent = Mechanize.new
13
+ @user_agent = user_agent
14
+ @agent.user_agent = user_agent
20
15
  end
21
16
 
22
- @logger = proc { |values| STDERR.puts(values.to_json) }
23
- @agent = Mechanize.new
24
- @agent.user_agent = Spidy::Connector::USER_AGENT
25
-
26
- class << self
27
- attr_reader :agent
28
- attr_accessor :logger
17
+ attr_reader :agent
18
+ attr_reader :logger
29
19
 
30
- def call(url, encoding: nil, retry_count: 5, &yielder)
31
- fail 'url is not specified' if url.blank?
32
- if encoding
33
- agent.default_encoding = encoding
34
- agent.force_default_encoding = true
35
- end
36
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
37
- get(url, retry_count, yielder)
20
+ def call(url, encoding: nil, retry_count: 5, &yielder)
21
+ fail 'url is not specified' if url.blank?
22
+ if encoding
23
+ agent.default_encoding = encoding
24
+ agent.force_default_encoding = true
38
25
  end
26
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
+ get(url, retry_count, yielder)
28
+ end
39
29
 
40
- private
30
+ private
41
31
 
42
- def get(url, retry_count, yielder)
43
- connect(url, retry_count, yielder)
44
- rescue Retry => e
45
- logger.call('retry.accessed': Time.current,
46
- 'retry.uri': url,
47
- 'retry.response_code': e.response_code,
48
- 'retry.rest_count': retry_count)
32
+ def get(url, retry_count, yielder)
33
+ connect(url, retry_count, yielder)
34
+ rescue Spidy::Connector::Retry => e
35
+ logger.call('retry.accessed': Time.current,
36
+ 'retry.uri': url,
37
+ 'retry.response_code': e.response_code,
38
+ 'retry.rest_count': retry_count)
49
39
 
50
- @agent = Mechanize.new
51
- @agent.user_agent = Spidy::Connector::USER_AGENT
40
+ @agent = Mechanize.new
41
+ @agent.user_agent = @user_agent
52
42
 
53
- retry_count -= 1
54
- if retry_count.positive?
55
- sleep e.wait_time
56
- retry
57
- end
58
- raise e
43
+ retry_count -= 1
44
+ if retry_count.positive?
45
+ sleep e.wait_time
46
+ retry
59
47
  end
48
+ raise e
49
+ end
60
50
 
61
- def connect(url, retry_count, yielder)
62
- result = nil
63
- agent.get(url) do |page|
64
- fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
51
+ def connect(url, retry_count, yielder)
52
+ result = nil
53
+ agent.get(url) do |page|
54
+ fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
65
55
 
66
- result = yielder.call(page)
67
- end
68
- result
69
- rescue Mechanize::ResponseCodeError => e
70
- raise Retry, error: e if e.response_code == '429'
71
- raise e
56
+ result = yielder.call(page)
72
57
  end
73
-
58
+ result
59
+ rescue Mechanize::ResponseCodeError => e
60
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
61
+ raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
62
+ raise e
74
63
  end
75
64
  end
@@ -3,9 +3,19 @@
3
3
  #
4
4
  # OpenURI to JSON.parse
5
5
  #
6
- module Spidy::Connector::Json
7
- def self.call(url)
6
+ class Spidy::Connector::Json
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ def initialize(wait_time: nil, user_agent: nil)
10
+ @user_agent = user_agent
11
+ end
12
+
13
+ def call(url, &block)
8
14
  fail 'url is not specified' if url.blank?
9
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
15
+ connect(url, &block)
16
+ end
17
+
18
+ def connect(url)
19
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
10
20
  end
11
21
  end
@@ -3,12 +3,18 @@
3
3
  #
4
4
  # xml
5
5
  #
6
- module Spidy::Connector::Xml
7
- def self.call(url)
6
+ class Spidy::Connector::Xml
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ def call(url)
8
10
  fail 'URL is undefined' if url.blank?
9
11
 
10
- OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
12
+ OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
11
13
  yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
14
  end
13
15
  end
16
+
17
+ def initialize(wait_time: nil, user_agent: nil)
18
+ @user_agent = user_agent
19
+ end
14
20
  end
@@ -18,4 +18,9 @@ class Spidy::Console
18
18
  def reload!
19
19
  @definition_file.eval_definition
20
20
  end
21
+
22
+ def connector(url, as:)
23
+ connector = Spidy::Connector.get(as)
24
+ connector.call(url) { |page| break page }
25
+ end
21
26
  end
@@ -16,6 +16,22 @@ module Spidy::Definition
16
16
  spidy.call(source, &yielder)
17
17
  end
18
18
 
19
+ def user_agent(user_agent)
20
+ @user_agent = user_agent
21
+ end
22
+
23
+ def wait_time(wait_time)
24
+ @wait_time = wait_time
25
+ end
26
+
27
+ def socks_proxy(host, port)
28
+ @socks_proxy = { host: host, port: port }
29
+ end
30
+
31
+ def tor?
32
+ Tor.running?
33
+ end
34
+
19
35
  def each(source = nil, name: :default, &yielder)
20
36
  name = name.presence || :default
21
37
  spidy = @namespace[:"#{name}_spider"]
@@ -26,7 +42,7 @@ module Spidy::Definition
26
42
 
27
43
  def spider(name = :default, connector: nil, as: nil, &define_block)
28
44
  @namespace ||= {}
29
- connector = Spidy::Connector.get(connector || as)
45
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
30
46
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
31
47
  define_block.call(yielder, connector, source)
32
48
  end
@@ -34,7 +50,7 @@ module Spidy::Definition
34
50
 
35
51
  def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
36
52
  @namespace ||= {}
37
- connector = Spidy::Connector.get(connector || as)
53
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
38
54
  binder = Spidy::Binder.get(self, binder || as)
39
55
  @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
40
56
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.2'
4
+ VERSION = '0.2.7'
5
5
  end
@@ -31,7 +31,9 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency 'ffaker'
32
32
  spec.add_development_dependency 'rspec-command'
33
33
 
34
+ spec.add_runtime_dependency 'tor'
34
35
  spec.add_runtime_dependency 'activesupport'
35
36
  spec.add_runtime_dependency 'mechanize'
37
+ spec.add_runtime_dependency 'socksify'
36
38
  spec.add_runtime_dependency 'pry'
37
39
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-09 00:00:00.000000000 Z
11
+ date: 2020-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: tor
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: activesupport
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -122,6 +136,20 @@ dependencies:
122
136
  - - ">="
123
137
  - !ruby/object:Gem::Version
124
138
  version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: socksify
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
125
153
  - !ruby/object:Gem::Dependency
126
154
  name: pry
127
155
  requirement: !ruby/object:Gem::Requirement
@@ -136,7 +164,7 @@ dependencies:
136
164
  - - ">="
137
165
  - !ruby/object:Gem::Version
138
166
  version: '0'
139
- description:
167
+ description:
140
168
  email:
141
169
  - aileron.cc@gmail.com
142
170
  executables:
@@ -174,7 +202,6 @@ files:
174
202
  - lib/spidy/console.rb
175
203
  - lib/spidy/definition.rb
176
204
  - lib/spidy/definition_file.rb
177
- - lib/spidy/interface.rb
178
205
  - lib/spidy/shell.rb
179
206
  - lib/spidy/spider.rb
180
207
  - lib/spidy/version.rb
@@ -184,7 +211,7 @@ homepage: https://github.com/aileron-inc/spidy
184
211
  licenses:
185
212
  - MIT
186
213
  metadata: {}
187
- post_install_message:
214
+ post_install_message:
188
215
  rdoc_options: []
189
216
  require_paths:
190
217
  - lib
@@ -199,8 +226,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
199
226
  - !ruby/object:Gem::Version
200
227
  version: '0'
201
228
  requirements: []
202
- rubygems_version: 3.1.2
203
- signing_key:
229
+ rubygems_version: 3.0.3
230
+ signing_key:
204
231
  specification_version: 4
205
232
  summary: web spider dsl
206
233
  test_files: []
@@ -1,12 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- #
4
- # spidy interface
5
- #
6
- class Spidy::Interface
7
- delegate :call, :each, :namespace, to: :@spidy
8
-
9
- def initialize(spidy)
10
- @spidy = spidy
11
- end
12
- end