spidy 0.3.3 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a70bf5d610f60d0b71c719cf870995a7e93b6e9abd71ef9823a71e2ed506f190
4
- data.tar.gz: 203bc7721020e244b9ad3ecc526b27e1c8d83a807cc2798bcb8930b1e17d5277
3
+ metadata.gz: 75215453c834a8e481b27cf4377235cc97ce6a6e4eff142a11743e68ee4982b4
4
+ data.tar.gz: 25f3b14ad6f31b580396458c8075167f24fd4a2f6e7ff98947338ecd0588eb9d
5
5
  SHA512:
6
- metadata.gz: 9977aeb13ff786bd8fbeb7d8ca0ee3ef7b67dfb577739c9aba0a310c46741fa48d86596de6f69ad05eaf2d64f9de4259bea84d4939ee2c7288781106fb25f2b3
7
- data.tar.gz: d64e9e66b25d8985f2c009abddd6f6b862aa8f533b67e125fa35008d308424e766d3ad4a8a64c16a0c9ed3448cdad14a09c80f6f77437e604826808286270d4f
6
+ metadata.gz: 447b7152b807c7985e16b7b403d27f9f7b949264577e8e4dc11a52358cb9af49510696d29166adb60bb5b87158aa8d2c10faf7c810f8eced4c29f9eed8bb493a
7
+ data.tar.gz: 55d82e5c495a7e5a0fd57b466e08ea072e3a712829b2d83d550b107147f30e3eaff54ed8dcec69163bf6f74e0c32990051af9de0d044cc19eadf39ecc749d003
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.6
1
+ 3.0.2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.3.3)
4
+ spidy (0.3.9)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -11,110 +11,114 @@ PATH
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- activesupport (6.0.3.3)
14
+ activesupport (7.0.0)
15
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
16
- i18n (>= 0.7, < 2)
17
- minitest (~> 5.1)
18
- tzinfo (~> 1.1)
19
- zeitwerk (~> 2.2, >= 2.2.2)
20
- addressable (2.7.0)
16
+ i18n (>= 1.6, < 2)
17
+ minitest (>= 5.1)
18
+ tzinfo (~> 2.0)
19
+ addressable (2.8.0)
21
20
  public_suffix (>= 2.0.2, < 5.0)
22
- capybara (3.33.0)
21
+ capybara (3.36.0)
23
22
  addressable
23
+ matrix
24
24
  mini_mime (>= 0.1.3)
25
25
  nokogiri (~> 1.8)
26
26
  rack (>= 1.6.0)
27
27
  rack-test (>= 0.6.3)
28
- regexp_parser (~> 1.5)
28
+ regexp_parser (>= 1.5, < 3.0)
29
29
  xpath (~> 3.2)
30
30
  capybara_discoball (0.1.0)
31
31
  capybara (>= 2.7, < 4)
32
- coderay (1.1.2)
33
- concurrent-ruby (1.1.7)
34
- connection_pool (2.2.3)
35
- diff-lcs (1.3)
32
+ coderay (1.1.3)
33
+ concurrent-ruby (1.1.9)
34
+ connection_pool (2.2.5)
35
+ diff-lcs (1.4.4)
36
36
  domain_name (0.5.20190701)
37
37
  unf (>= 0.0.5, < 1.0.0)
38
- ffaker (2.10.0)
39
- http-cookie (1.0.3)
38
+ ffaker (2.20.0)
39
+ http-cookie (1.0.4)
40
40
  domain_name (~> 0.5)
41
- i18n (1.8.5)
41
+ i18n (1.8.11)
42
42
  concurrent-ruby (~> 1.0)
43
- mechanize (2.7.6)
44
- domain_name (~> 0.5, >= 0.5.1)
45
- http-cookie (~> 1.0)
46
- mime-types (>= 1.17.2)
47
- net-http-digest_auth (~> 1.1, >= 1.1.1)
48
- net-http-persistent (>= 2.5.2)
49
- nokogiri (~> 1.6)
50
- ntlm-http (~> 0.1, >= 0.1.1)
51
- webrobots (>= 0.0.9, < 0.2)
52
- method_source (0.9.2)
53
- mime-types (3.3.1)
43
+ matrix (0.4.2)
44
+ mechanize (2.8.3)
45
+ addressable (~> 2.8)
46
+ domain_name (~> 0.5, >= 0.5.20190701)
47
+ http-cookie (~> 1.0, >= 1.0.3)
48
+ mime-types (~> 3.0)
49
+ net-http-digest_auth (~> 1.4, >= 1.4.1)
50
+ net-http-persistent (>= 2.5.2, < 5.0.dev)
51
+ nokogiri (~> 1.11, >= 1.11.2)
52
+ rubyntlm (~> 0.6, >= 0.6.3)
53
+ webrick (~> 1.7)
54
+ webrobots (~> 0.1.2)
55
+ method_source (1.0.0)
56
+ mime-types (3.4.1)
54
57
  mime-types-data (~> 3.2015)
55
- mime-types-data (3.2020.0512)
56
- mini_mime (1.0.2)
57
- mini_portile2 (2.4.0)
58
- minitest (5.14.2)
58
+ mime-types-data (3.2021.1115)
59
+ mini_mime (1.1.2)
60
+ mini_portile2 (2.6.1)
61
+ minitest (5.15.0)
59
62
  mixlib-shellout (2.4.4)
60
63
  mustermann (1.1.1)
61
64
  ruby2_keywords (~> 0.0.1)
62
65
  net-http-digest_auth (1.4.1)
63
- net-http-persistent (4.0.0)
66
+ net-http-persistent (4.0.1)
64
67
  connection_pool (~> 2.2)
65
- nokogiri (1.10.10)
66
- mini_portile2 (~> 2.4.0)
67
- ntlm-http (0.1.1)
68
- pry (0.12.2)
69
- coderay (~> 1.1.0)
70
- method_source (~> 0.9.0)
68
+ nokogiri (1.12.5)
69
+ mini_portile2 (~> 2.6.1)
70
+ racc (~> 1.4)
71
+ pry (0.14.1)
72
+ coderay (~> 1.1)
73
+ method_source (~> 1.0)
71
74
  public_suffix (4.0.6)
75
+ racc (1.6.0)
72
76
  rack (2.2.3)
73
- rack-protection (2.0.8.1)
77
+ rack-protection (2.1.0)
74
78
  rack
75
79
  rack-test (1.1.0)
76
80
  rack (>= 1.0, < 3)
77
- rake (10.5.0)
78
- regexp_parser (1.8.1)
79
- rspec (3.8.0)
80
- rspec-core (~> 3.8.0)
81
- rspec-expectations (~> 3.8.0)
82
- rspec-mocks (~> 3.8.0)
81
+ rake (13.0.6)
82
+ regexp_parser (2.2.0)
83
+ rspec (3.10.0)
84
+ rspec-core (~> 3.10.0)
85
+ rspec-expectations (~> 3.10.0)
86
+ rspec-mocks (~> 3.10.0)
83
87
  rspec-command (1.0.3)
84
88
  mixlib-shellout (~> 2.0)
85
89
  rspec (~> 3.2)
86
90
  rspec-its (~> 1.2)
87
- rspec-core (3.8.2)
88
- rspec-support (~> 3.8.0)
89
- rspec-expectations (3.8.4)
91
+ rspec-core (3.10.1)
92
+ rspec-support (~> 3.10.0)
93
+ rspec-expectations (3.10.1)
90
94
  diff-lcs (>= 1.2.0, < 2.0)
91
- rspec-support (~> 3.8.0)
95
+ rspec-support (~> 3.10.0)
92
96
  rspec-its (1.3.0)
93
97
  rspec-core (>= 3.0.0)
94
98
  rspec-expectations (>= 3.0.0)
95
- rspec-mocks (3.8.1)
99
+ rspec-mocks (3.10.2)
96
100
  diff-lcs (>= 1.2.0, < 2.0)
97
- rspec-support (~> 3.8.0)
98
- rspec-support (3.8.2)
99
- ruby2_keywords (0.0.2)
100
- sinatra (2.0.8.1)
101
+ rspec-support (~> 3.10.0)
102
+ rspec-support (3.10.3)
103
+ ruby2_keywords (0.0.5)
104
+ rubyntlm (0.6.3)
105
+ sinatra (2.1.0)
101
106
  mustermann (~> 1.0)
102
- rack (~> 2.0)
103
- rack-protection (= 2.0.8.1)
107
+ rack (~> 2.2)
108
+ rack-protection (= 2.1.0)
104
109
  tilt (~> 2.0)
105
110
  socksify (1.7.1)
106
- thread_safe (0.3.6)
107
111
  tilt (2.0.10)
108
- tor (0.1.4)
109
- tzinfo (1.2.7)
110
- thread_safe (~> 0.1)
112
+ tor (0.1.5)
113
+ tzinfo (2.0.4)
114
+ concurrent-ruby (~> 1.0)
111
115
  unf (0.1.4)
112
116
  unf_ext
113
- unf_ext (0.0.7.7)
117
+ unf_ext (0.0.8)
118
+ webrick (1.7.0)
114
119
  webrobots (0.1.2)
115
120
  xpath (3.2.0)
116
121
  nokogiri (~> 1.8)
117
- zeitwerk (2.4.0)
118
122
 
119
123
  PLATFORMS
120
124
  ruby
@@ -124,11 +128,11 @@ DEPENDENCIES
124
128
  capybara_discoball
125
129
  ffaker
126
130
  pry
127
- rake (~> 10.0)
131
+ rake (~> 13.0)
128
132
  rspec (~> 3.0)
129
133
  rspec-command
130
134
  sinatra
131
135
  spidy!
132
136
 
133
137
  BUNDLED WITH
134
- 2.1.4
138
+ 2.2.22
data/example/wikip.rb ADDED
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ Spidy.define do
4
+ def self.infobox_scrape(params, &block) # presumably hands the '.infobox' node to the :infobox definition below - verify against Spidy::Definition#call
5
+ call(params.html.at('.infobox'), name: :infobox, &block)
6
+ end
7
+
8
+ define(as: :html) do
9
+ let(:title, 'h1')
10
+ end
11
+
12
+ define(:infobox, as: :html, connector: :direct) do
13
+ let(:columns) do
14
+ html.search('tr').map do |tr| # map, not each: each returns the node set and discards the hashes built below
15
+ {
16
+ name: tr.at('th')&.text,
17
+ value: tr.at('td')&.text
18
+ }
19
+ end
20
+ end
21
+ end
22
+ end
data/exe/spidy CHANGED
@@ -17,7 +17,8 @@ else
17
17
  when 'function' then Spidy.shell(ARGV[1]).function
18
18
  when 'call' then Spidy.shell(ARGV[1]).call(ARGV[2])
19
19
  when 'each' then Spidy.shell(ARGV[1]).each(ARGV[2])
20
+ when 'eval' then Spidy.shell(ARGV[1]).eval_call(ARGV[2])
20
21
  else
21
- STDOUT.puts 'usage: spidy [console function call each] [file]'
22
+ STDOUT.puts 'usage: spidy [console function call each eval] [file]' # command names must match the when branches above
22
23
  end
23
24
  end
@@ -3,12 +3,12 @@
3
3
  #
4
4
  # Bind html and convert to object
5
5
  #
6
- class Spidy::Binder::Html < Spidy::Binder::Base
7
- def self.let(name, query = nil, &block)
6
+ module Spidy::Binder::Html
7
+ def let(name, query = nil, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
11
- return define_method(name) { html.at(query)&.text } if block.nil?
11
+ return define_method(name) { html.at(query)&.text&.strip } if block.nil?
12
12
 
13
13
  define_method(name) do
14
14
  if query.present?
@@ -20,6 +20,7 @@ class Spidy::Binder::Html < Spidy::Binder::Base
20
20
  fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
-
24
- alias_method :html, :resource
23
+ def self.extended(obj)
24
+ obj.alias_method :html, :resource
25
+ end
25
26
  end
@@ -3,8 +3,8 @@
3
3
  #
4
4
  # Bind json and convert to object
5
5
  #
6
- class Spidy::Binder::Json < Spidy::Binder::Base
7
- def self.let(name, *query, &block)
6
+ module Spidy::Binder::Json
7
+ def let(name, *query, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
@@ -20,6 +20,7 @@ class Spidy::Binder::Json < Spidy::Binder::Base
20
20
  fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
-
24
- alias_method :json, :resource
23
+ def self.extended(obj)
24
+ obj.alias_method :json, :resource
25
+ end
25
26
  end
@@ -3,12 +3,12 @@
3
3
  #
4
4
  # Bind xml and convert to object
5
5
  #
6
- class Spidy::Binder::Xml < Spidy::Binder::Base
7
- def self.let(name, query = nil, &block)
6
+ module Spidy::Binder::Xml
7
+ def let(name, query = nil, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
11
- return define_method(name) { xml.at(query)&.text } if block.nil?
11
+ return define_method(name) { xml.at(query)&.text&.strip } if block.nil?
12
12
 
13
13
  define_method(name) do
14
14
  if query.present?
@@ -20,6 +20,7 @@ class Spidy::Binder::Xml < Spidy::Binder::Base
20
20
  fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
-
24
- alias_method :xml, :resource
23
+ def self.extended(obj)
24
+ obj.alias_method :xml, :resource
25
+ end
25
26
  end
data/lib/spidy/binder.rb CHANGED
@@ -8,48 +8,4 @@ module Spidy::Binder
8
8
  autoload :Json
9
9
  autoload :Html
10
10
  autoload :Xml
11
-
12
- class Error < StandardError
13
- end
14
-
15
- class Caller
16
- def initialize(spidy, binder)
17
- @spidy = spidy
18
- @binder = binder
19
- end
20
-
21
- def call(source, url: nil, define: nil, define_name: nil)
22
- yield Class.new(@binder, &define).new(define_name, @spidy, source, url)
23
- end
24
- end
25
-
26
- class Base
27
- class << self
28
- attr_reader :attribute_names
29
- end
30
-
31
- attr_reader :resource, :url
32
-
33
- def initialize(define_name, spidy, resource, url)
34
- @define_name = define_name
35
- @spidy = spidy
36
- @resource = resource
37
- @url = url
38
- end
39
-
40
- def to_s
41
- to_h.to_json
42
- end
43
-
44
- def to_h
45
- self.class.attribute_names.map { |name| [name, send(name)] }.to_h
46
- end
47
- end
48
-
49
-
50
- def self.get(spidy, value)
51
- return Caller.new(spidy, const_get(value.to_s.classify)) if name.is_a?(String) || name.is_a?(Symbol)
52
-
53
- value
54
- end
55
11
  end
@@ -8,6 +8,10 @@ class Spidy::CommandLine
8
8
  class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
9
9
  class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
10
10
 
11
+ def eval_call(script)
12
+ @definition_file.spidy.instance_eval(script) # NOTE(review): runs the given string as Ruby inside the loaded spidy definition; exe/spidy passes ARGV[2] here for the 'eval' command
13
+ end
14
+
11
15
  def initialize(definition_file)
12
16
  @definition_file = definition_file
13
17
  raise 'unloaded spidy' if definition_file.spidy.nil?
@@ -5,7 +5,11 @@
5
5
  #
6
6
  class Spidy::Connector::Direct
7
7
  def call(resource, &yielder)
8
- yielder.call(resource)
8
+ if block_given?
9
+ yield resource
10
+ else
11
+ resource
12
+ end
9
13
  end
10
14
 
11
15
  def initialize(user_agent:)
@@ -126,6 +126,8 @@ module Spidy::Connector
126
126
  wait_time ||= DEFAULT_WAIT_TIME
127
127
 
128
128
  connector = get_connector(value, user_agent: user_agent, socks_proxy: socks_proxy)
129
+ return connector if connector.is_a?(Spidy::Connector::Direct)
130
+
129
131
  RetryableCaller.new(connector, wait_time: wait_time, logger: logger)
130
132
  end
131
133
 
@@ -0,0 +1,19 @@
1
+ class Spidy::DefineObject # base class of the anonymous scraper classes created by Spidy::Definition#define
2
+ class << self
3
+ attr_reader :attribute_names # filled by the binder's let; stays nil until the first let is declared
4
+ end
5
+ attr_reader :resource, :url
6
+
7
+ def initialize(resource, url) # resource: the page/document handed over by the connector; url: the source it was requested with
8
+ @resource = resource
9
+ @url = url
10
+ end
11
+
12
+ def to_s # JSON text of to_h
13
+ to_h.to_json
14
+ end
15
+
16
+ def to_h # { attribute name => value } for every attribute declared with let
17
+ Array(self.class.attribute_names).map { |name| [name, send(name)] }.to_h # Array() turns a nil attribute_names (no let declared) into [] instead of raising NoMethodError
18
+ end
19
+ end
@@ -44,22 +44,20 @@ module Spidy::Definition
44
44
  end
45
45
  end
46
46
 
47
- def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
48
- @namespace ||= {}
47
+ def define(name = :default, connector: nil, as: nil, &define_block)
49
48
  connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
50
- binder = Spidy::Binder.get(self, binder || as)
51
- @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
52
- end
53
-
54
- private
55
-
56
- def define_proc(name, connector, binder, define_block)
57
- proc do |source, &yielder|
58
- yielder = lambda { |result| break result } if yielder.nil?
59
- connection_yielder = lambda do |page|
60
- binder.call(page, url: source, define: define_block, define_name: name) { |object| yielder.call(object) }
49
+ binder_base = Spidy::Binder.const_get(as.to_s.classify)
50
+ @namespace ||= {}
51
+ @namespace[:"#{name}_scraper"] = Class.new(Spidy::DefineObject) do
52
+ extend binder_base
53
+ class_eval(&define_block)
54
+ define_singleton_method(:call) do |source, &yielder|
55
+ yielder = lambda { |result| break result } if yielder.nil?
56
+ connection_yielder = lambda do |page|
57
+ yielder.call(new(page, source))
58
+ end
59
+ connector.call(source, &connection_yielder)
61
60
  end
62
- connector.call(source, &connection_yielder)
63
61
  end
64
62
  end
65
63
  end
data/lib/spidy/shell.rb CHANGED
@@ -16,5 +16,5 @@ class Spidy::Shell
16
16
  Spidy::CommandLine.new(@definition_file)
17
17
  end
18
18
 
19
- delegate :function, :each, :call, to: :command_line
19
+ delegate :function, :each, :call, :eval_call, to: :command_line
20
20
  end
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.3.3'
4
+ VERSION = '0.3.9'
5
5
  end
data/lib/spidy.rb CHANGED
@@ -19,6 +19,7 @@ module Spidy
19
19
  autoload :DefinitionFile
20
20
  autoload :Binder
21
21
  autoload :Connector
22
+ autoload :DefineObject
22
23
 
23
24
  def self.shell(filepath = nil)
24
25
  Spidy::Shell.new(filepath)
data/spidy.gemspec CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
26
26
 
27
27
  spec.add_development_dependency 'bundler', '~> 2.0'
28
28
  spec.add_development_dependency 'pry'
29
- spec.add_development_dependency 'rake', '~> 10.0'
29
+ spec.add_development_dependency 'rake', '~> 13.0'
30
30
  spec.add_development_dependency 'rspec', '~> 3.0'
31
31
  spec.add_development_dependency 'ffaker'
32
32
  spec.add_development_dependency 'rspec-command'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-05 00:00:00.000000000 Z
11
+ date: 2021-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: '13.0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: '13.0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -218,6 +218,7 @@ files:
218
218
  - example/master_detail.rb
219
219
  - example/proxy.rb
220
220
  - example/retry.rb
221
+ - example/wikip.rb
221
222
  - exe/spidy
222
223
  - lib/spidy.rb
223
224
  - lib/spidy/binder.rb
@@ -231,6 +232,7 @@ files:
231
232
  - lib/spidy/connector/json.rb
232
233
  - lib/spidy/connector/xml.rb
233
234
  - lib/spidy/console.rb
235
+ - lib/spidy/define_object.rb
234
236
  - lib/spidy/definition.rb
235
237
  - lib/spidy/definition_file.rb
236
238
  - lib/spidy/shell.rb
@@ -257,7 +259,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
257
259
  - !ruby/object:Gem::Version
258
260
  version: '0'
259
261
  requirements: []
260
- rubygems_version: 3.0.3
262
+ rubygems_version: 3.2.22
261
263
  signing_key:
262
264
  specification_version: 4
263
265
  summary: web spider dsl