spidy 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 75215453c834a8e481b27cf4377235cc97ce6a6e4eff142a11743e68ee4982b4
4
- data.tar.gz: 25f3b14ad6f31b580396458c8075167f24fd4a2f6e7ff98947338ecd0588eb9d
3
+ metadata.gz: b6a7d70df09642e17d34cc85e1973914b8b7e151c34670526cb4d6b2d3589227
4
+ data.tar.gz: 012b7def5510c16d68676bada533d452315244fc5d47d7f26a9bd71068a3f9a3
5
5
  SHA512:
6
- metadata.gz: 447b7152b807c7985e16b7b403d27f9f7b949264577e8e4dc11a52358cb9af49510696d29166adb60bb5b87158aa8d2c10faf7c810f8eced4c29f9eed8bb493a
7
- data.tar.gz: 55d82e5c495a7e5a0fd57b466e08ea072e3a712829b2d83d550b107147f30e3eaff54ed8dcec69163bf6f74e0c32990051af9de0d044cc19eadf39ecc749d003
6
+ metadata.gz: 8b6682cd3d1499b115cdfba3964cab62ae65f3a7943fb87f53d8613c2a6553b4f2a4a0728f2af07990dc90ae03c07df4d33914739ab4df462a37c2b04f5efdc5
7
+ data.tar.gz: a2680dd41fb1a6dead95ecd20742560d749f5c3f27367baac7d3f1294d4c6ee7de946cb173e1f2750169e6a341915197e375d1e56ca0faf7c5be4491f5a55ae9
data/.rubocop.yml CHANGED
@@ -1,7 +1,8 @@
1
1
  inherit_from: .rubocop_todo.yml
2
2
  AllCops:
3
+ TargetRubyVersion: 3.0.2
4
+ NewCops: enable
3
5
  DisplayCopNames: true
4
- TargetRubyVersion: 2.6
5
6
 
6
7
  Style/ClassAndModuleChildren:
7
8
  Enabled: false
@@ -9,7 +10,7 @@ Style/ClassAndModuleChildren:
9
10
  Style/SignalException:
10
11
  EnforcedStyle: semantic
11
12
 
12
- Naming/UncommunicativeMethodParamName:
13
+ Naming/MethodParameterName:
13
14
  AllowedNames:
14
15
  - as
15
16
 
@@ -17,8 +18,11 @@ Metrics/AbcSize:
17
18
  Max: 21
18
19
  Exclude:
19
20
 
21
+ Metrics/MethodLength:
22
+ Max: 15
23
+
20
24
  Metrics/LineLength:
21
- Max: 120
25
+ Max: 130
22
26
 
23
27
  Metrics/BlockLength:
24
28
  Max: 120
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.0.2
1
+ 3.4.2
data/CLAUDE.md ADDED
@@ -0,0 +1,28 @@
1
+ # Claude Helper for Spidy
2
+
3
+ ## Build/Test/Lint Commands
4
+ - Install dependencies: `bundle install`
5
+ - Run all tests: `bundle exec rake spec`
6
+ - Run single test: `bundle exec rspec spec/path/to_spec.rb:LINE_NUMBER`
7
+ - Install gem locally: `bundle exec rake install`
8
+ - Release gem: `bundle exec rake release`
9
+
10
+ ## Code Style Guidelines
11
+ - **Naming Conventions**:
12
+ - snake_case for methods/variables/files
13
+ - CamelCase for classes/modules
14
+ - SCREAMING_SNAKE_CASE for constants
15
+ - **File Organization**: Match file paths to module/class hierarchy
16
+ - **Imports**:
17
+ - Add `# frozen_string_literal: true` at file start
18
+ - Use `extend ActiveSupport::Autoload` for modules with sub-modules
19
+ - **Error Handling**: Create custom error classes inheriting from StandardError
20
+ - **Documentation**: Add brief comments before classes and methods
21
+ - **Testing**:
22
+ - Use RSpec with `expect` syntax
23
+ - Organize with `describe` and `specify` blocks
24
+ - Name test files with `_spec.rb` suffix
25
+
26
+ ## Dependencies
27
+ - Runtime: activesupport, mechanize, socksify, tor
28
+ - Development: bundler, capybara_discoball, ffaker, rake, rspec, sinatra
data/Gemfile CHANGED
@@ -4,3 +4,7 @@ source 'https://rubygems.org'
4
4
 
5
5
  # Specify your gem's dependencies in crawler.gemspec
6
6
  gemspec
7
+
8
+ gem 'webrick'
9
+ gem 'rackup'
10
+ gem 'irb'
data/Gemfile.lock CHANGED
@@ -1,138 +1,193 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.3.9)
5
- activesupport
4
+ spidy (0.3.12)
5
+ activesupport (~> 7.1)
6
6
  mechanize
7
- pry
8
7
  socksify
9
8
  tor
10
9
 
11
10
  GEM
12
11
  remote: https://rubygems.org/
13
12
  specs:
14
- activesupport (7.0.0)
15
- concurrent-ruby (~> 1.0, >= 1.0.2)
13
+ activesupport (7.2.2.1)
14
+ base64
15
+ benchmark (>= 0.3)
16
+ bigdecimal
17
+ concurrent-ruby (~> 1.0, >= 1.3.1)
18
+ connection_pool (>= 2.2.5)
19
+ drb
16
20
  i18n (>= 1.6, < 2)
21
+ logger (>= 1.4.2)
17
22
  minitest (>= 5.1)
18
- tzinfo (~> 2.0)
19
- addressable (2.8.0)
20
- public_suffix (>= 2.0.2, < 5.0)
21
- capybara (3.36.0)
23
+ securerandom (>= 0.3)
24
+ tzinfo (~> 2.0, >= 2.0.5)
25
+ addressable (2.8.7)
26
+ public_suffix (>= 2.0.2, < 7.0)
27
+ base64 (0.2.0)
28
+ benchmark (0.4.0)
29
+ bigdecimal (3.1.9)
30
+ capybara (3.40.0)
22
31
  addressable
23
32
  matrix
24
33
  mini_mime (>= 0.1.3)
25
- nokogiri (~> 1.8)
34
+ nokogiri (~> 1.11)
26
35
  rack (>= 1.6.0)
27
36
  rack-test (>= 0.6.3)
28
37
  regexp_parser (>= 1.5, < 3.0)
29
38
  xpath (~> 3.2)
30
39
  capybara_discoball (0.1.0)
31
40
  capybara (>= 2.7, < 4)
32
- coderay (1.1.3)
33
- concurrent-ruby (1.1.9)
34
- connection_pool (2.2.5)
35
- diff-lcs (1.4.4)
36
- domain_name (0.5.20190701)
37
- unf (>= 0.0.5, < 1.0.0)
38
- ffaker (2.20.0)
39
- http-cookie (1.0.4)
41
+ concurrent-ruby (1.3.5)
42
+ connection_pool (2.5.0)
43
+ date (3.4.1)
44
+ diff-lcs (1.6.0)
45
+ domain_name (0.6.20240107)
46
+ drb (2.2.1)
47
+ ffaker (2.24.0)
48
+ http-cookie (1.0.8)
40
49
  domain_name (~> 0.5)
41
- i18n (1.8.11)
50
+ i18n (1.14.7)
42
51
  concurrent-ruby (~> 1.0)
52
+ io-console (0.8.0)
53
+ irb (1.15.1)
54
+ pp (>= 0.6.0)
55
+ rdoc (>= 4.0.0)
56
+ reline (>= 0.4.2)
57
+ logger (1.6.6)
43
58
  matrix (0.4.2)
44
- mechanize (2.8.3)
59
+ mechanize (2.14.0)
45
60
  addressable (~> 2.8)
61
+ base64
46
62
  domain_name (~> 0.5, >= 0.5.20190701)
47
63
  http-cookie (~> 1.0, >= 1.0.3)
48
- mime-types (~> 3.0)
64
+ mime-types (~> 3.3)
49
65
  net-http-digest_auth (~> 1.4, >= 1.4.1)
50
66
  net-http-persistent (>= 2.5.2, < 5.0.dev)
67
+ nkf
51
68
  nokogiri (~> 1.11, >= 1.11.2)
52
69
  rubyntlm (~> 0.6, >= 0.6.3)
53
70
  webrick (~> 1.7)
54
71
  webrobots (~> 0.1.2)
55
- method_source (1.0.0)
56
- mime-types (3.4.1)
72
+ mime-types (3.6.1)
73
+ logger
57
74
  mime-types-data (~> 3.2015)
58
- mime-types-data (3.2021.1115)
59
- mini_mime (1.1.2)
60
- mini_portile2 (2.6.1)
61
- minitest (5.15.0)
75
+ mime-types-data (3.2025.0318)
76
+ mini_mime (1.1.5)
77
+ minitest (5.25.5)
62
78
  mixlib-shellout (2.4.4)
63
- mustermann (1.1.1)
79
+ mustermann (3.0.3)
64
80
  ruby2_keywords (~> 0.0.1)
65
81
  net-http-digest_auth (1.4.1)
66
- net-http-persistent (4.0.1)
82
+ net-http-persistent (4.0.5)
67
83
  connection_pool (~> 2.2)
68
- nokogiri (1.12.5)
69
- mini_portile2 (~> 2.6.1)
84
+ nkf (0.2.0)
85
+ nokogiri (1.18.5-aarch64-linux-gnu)
86
+ racc (~> 1.4)
87
+ nokogiri (1.18.5-aarch64-linux-musl)
88
+ racc (~> 1.4)
89
+ nokogiri (1.18.5-arm-linux-gnu)
90
+ racc (~> 1.4)
91
+ nokogiri (1.18.5-arm-linux-musl)
92
+ racc (~> 1.4)
93
+ nokogiri (1.18.5-arm64-darwin)
94
+ racc (~> 1.4)
95
+ nokogiri (1.18.5-x86_64-darwin)
96
+ racc (~> 1.4)
97
+ nokogiri (1.18.5-x86_64-linux-gnu)
98
+ racc (~> 1.4)
99
+ nokogiri (1.18.5-x86_64-linux-musl)
70
100
  racc (~> 1.4)
71
- pry (0.14.1)
72
- coderay (~> 1.1)
73
- method_source (~> 1.0)
74
- public_suffix (4.0.6)
75
- racc (1.6.0)
76
- rack (2.2.3)
77
- rack-protection (2.1.0)
78
- rack
79
- rack-test (1.1.0)
80
- rack (>= 1.0, < 3)
81
- rake (13.0.6)
82
- regexp_parser (2.2.0)
83
- rspec (3.10.0)
84
- rspec-core (~> 3.10.0)
85
- rspec-expectations (~> 3.10.0)
86
- rspec-mocks (~> 3.10.0)
101
+ pp (0.6.2)
102
+ prettyprint
103
+ prettyprint (0.2.0)
104
+ psych (5.2.3)
105
+ date
106
+ stringio
107
+ public_suffix (6.0.1)
108
+ racc (1.8.1)
109
+ rack (3.1.12)
110
+ rack-protection (4.1.1)
111
+ base64 (>= 0.1.0)
112
+ logger (>= 1.6.0)
113
+ rack (>= 3.0.0, < 4)
114
+ rack-session (2.1.0)
115
+ base64 (>= 0.1.0)
116
+ rack (>= 3.0.0)
117
+ rack-test (2.2.0)
118
+ rack (>= 1.3)
119
+ rackup (2.2.1)
120
+ rack (>= 3)
121
+ rake (13.2.1)
122
+ rdoc (6.12.0)
123
+ psych (>= 4.0.0)
124
+ regexp_parser (2.10.0)
125
+ reline (0.6.0)
126
+ io-console (~> 0.5)
127
+ rspec (3.13.0)
128
+ rspec-core (~> 3.13.0)
129
+ rspec-expectations (~> 3.13.0)
130
+ rspec-mocks (~> 3.13.0)
87
131
  rspec-command (1.0.3)
88
132
  mixlib-shellout (~> 2.0)
89
133
  rspec (~> 3.2)
90
134
  rspec-its (~> 1.2)
91
- rspec-core (3.10.1)
92
- rspec-support (~> 3.10.0)
93
- rspec-expectations (3.10.1)
135
+ rspec-core (3.13.3)
136
+ rspec-support (~> 3.13.0)
137
+ rspec-expectations (3.13.3)
94
138
  diff-lcs (>= 1.2.0, < 2.0)
95
- rspec-support (~> 3.10.0)
96
- rspec-its (1.3.0)
139
+ rspec-support (~> 3.13.0)
140
+ rspec-its (1.3.1)
97
141
  rspec-core (>= 3.0.0)
98
142
  rspec-expectations (>= 3.0.0)
99
- rspec-mocks (3.10.2)
143
+ rspec-mocks (3.13.2)
100
144
  diff-lcs (>= 1.2.0, < 2.0)
101
- rspec-support (~> 3.10.0)
102
- rspec-support (3.10.3)
145
+ rspec-support (~> 3.13.0)
146
+ rspec-support (3.13.2)
103
147
  ruby2_keywords (0.0.5)
104
- rubyntlm (0.6.3)
105
- sinatra (2.1.0)
106
- mustermann (~> 1.0)
107
- rack (~> 2.2)
108
- rack-protection (= 2.1.0)
148
+ rubyntlm (0.6.5)
149
+ base64
150
+ securerandom (0.4.1)
151
+ sinatra (4.1.1)
152
+ logger (>= 1.6.0)
153
+ mustermann (~> 3.0)
154
+ rack (>= 3.0.0, < 4)
155
+ rack-protection (= 4.1.1)
156
+ rack-session (>= 2.0.0, < 3)
109
157
  tilt (~> 2.0)
110
158
  socksify (1.7.1)
111
- tilt (2.0.10)
112
- tor (0.1.5)
113
- tzinfo (2.0.4)
159
+ stringio (3.1.5)
160
+ tilt (2.6.0)
161
+ tor (0.1.7)
162
+ tzinfo (2.0.6)
114
163
  concurrent-ruby (~> 1.0)
115
- unf (0.1.4)
116
- unf_ext
117
- unf_ext (0.0.8)
118
- webrick (1.7.0)
164
+ webrick (1.9.1)
119
165
  webrobots (0.1.2)
120
166
  xpath (3.2.0)
121
167
  nokogiri (~> 1.8)
122
168
 
123
169
  PLATFORMS
124
- ruby
170
+ aarch64-linux-gnu
171
+ aarch64-linux-musl
172
+ arm-linux-gnu
173
+ arm-linux-musl
174
+ arm64-darwin
175
+ x86_64-darwin
176
+ x86_64-linux-gnu
177
+ x86_64-linux-musl
125
178
 
126
179
  DEPENDENCIES
127
180
  bundler (~> 2.0)
128
181
  capybara_discoball
129
182
  ffaker
130
- pry
183
+ irb
184
+ rackup
131
185
  rake (~> 13.0)
132
186
  rspec (~> 3.0)
133
187
  rspec-command
134
188
  sinatra
135
189
  spidy!
190
+ webrick
136
191
 
137
192
  BUNDLED WITH
138
- 2.2.22
193
+ 2.6.5
data/README.md CHANGED
@@ -51,7 +51,7 @@ spidy console website.rb
51
51
 
52
52
  ### reload source code
53
53
  ```
54
- pry(#<Spidy::Console>)> reload!
54
+ irb(#<Spidy::Console>)> reload!
55
55
  ```
56
56
 
57
57
  ```rb
data/bin/console CHANGED
@@ -3,12 +3,11 @@
3
3
 
4
4
  require 'bundler/setup'
5
5
  require 'spidy'
6
+ require 'irb'
6
7
 
7
8
  # You can add fixtures and/or initialization code here to make experimenting
8
9
  # with your gem easier. You can also use a different console, if you like.
9
10
 
10
- # (If you use this, don't forget to add pry to your Gemfile!)
11
- require 'pry'
12
11
  def reload!
13
12
  ActiveSupport::Dependencies.clear
14
13
  ActiveSupport::DescendantsTracker.clear
@@ -18,5 +17,5 @@ end
18
17
  if ARGV[0]
19
18
  Spidy.open(ARGV[0]).console
20
19
  else
21
- Pry.start
20
+ IRB.start
22
21
  end
@@ -1,7 +1,7 @@
1
-
1
+ # frozen_string_literal: true
2
2
 
3
3
  Spidy.define do
4
- url_to_params = ->(url) {
4
+ url_to_params = lambda { |url|
5
5
  uri = URI.parse(url)
6
6
  params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
7
  params if params.present?
@@ -13,41 +13,41 @@ Spidy.define do
13
13
 
14
14
  limit_page = 3
15
15
  per_page = 25
16
- yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
- doc.html {
18
- doc.body {
19
- doc.span.bold {
20
- doc.text "Hello world"
21
- }
22
- doc.main {
23
- (page * per_page + 1).upto((page + 1) * per_page).each do |i|
16
+ yielder.call(Nokogiri::HTML::Builder.new do |doc|
17
+ doc.html do
18
+ doc.body do
19
+ doc.span.bold do
20
+ doc.text 'Hello world'
21
+ end
22
+ doc.main do
23
+ ((page * per_page) + 1).upto((page + 1) * per_page).each do |i|
24
24
  doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
25
  end
26
- }
26
+ end
27
27
  doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
- }
29
- }
30
- }.doc)
28
+ end
29
+ end
30
+ end.doc)
31
31
  }
32
32
 
33
33
  detail_page = proc { |url, &yielder|
34
34
  params = url_to_params.call(url)
35
35
  id = params['id']
36
36
 
37
- yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
- doc.html {
39
- doc.body {
40
- doc.span.bold {
41
- doc.text "Hello world"
42
- }
37
+ yielder.call(Nokogiri::HTML::Builder.new do |doc|
38
+ doc.html do
39
+ doc.body do
40
+ doc.span.bold do
41
+ doc.text 'Hello world'
42
+ end
43
43
  doc.h1("title_#{id}", id: 'title')
44
44
  doc.main("body_#{id}", id: 'body')
45
45
  doc.div.sub do
46
46
  doc.span.name('testtest')
47
47
  end
48
- }
49
- }
50
- }.doc)
48
+ end
49
+ end
50
+ end.doc)
51
51
  }
52
52
 
53
53
  define(as: :html, connector: detail_page) do
data/example/proxy.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Spidy.define do
2
4
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
5
  socks_proxy '127.0.0.1', 9050
data/example/retry.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Spidy.define do
2
4
  spider(as: :json) do |yielder, connector|
3
5
  connector.call('https://httpbin.org/status/500') do |json|
data/example/wikip.rb CHANGED
@@ -11,11 +11,8 @@ Spidy.define do
11
11
 
12
12
  define(:infobox, as: :html, connector: :direct) do
13
13
  let(:columns) do
14
- html.search('tr').each do |tr|
15
- {
16
- name: tr.at('th')&.text,
17
- value: tr.at('td')&.text
18
- }
14
+ html.search('tr').map do |tr|
15
+ { name: tr.at('th')&.text, value: tr.at('td')&.text }
19
16
  end
20
17
  end
21
18
  end
data/exe/spidy CHANGED
@@ -2,14 +2,13 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'spidy'
5
- require 'pry'
6
5
 
7
6
  if ARGV[1].blank?
8
7
  case ARGV[0]
9
- when 'version' then STDOUT.puts(Spidy::VERSION)
8
+ when 'version' then $stdout.puts(Spidy::VERSION)
10
9
  when 'console' then Spidy.shell.interactive
11
10
  else
12
- STDOUT.puts 'usage: spidy [version console]'
11
+ $stdout.puts 'usage: spidy [version console]'
13
12
  end
14
13
  else
15
14
  case ARGV[0]
@@ -19,6 +18,6 @@ else
19
18
  when 'each' then Spidy.shell(ARGV[1]).each(ARGV[2])
20
19
  when 'eval' then Spidy.shell(ARGV[1]).eval_call(ARGV[2])
21
20
  else
22
- STDOUT.puts 'usage: spidy [console function call each run] [file]'
21
+ $stdout.puts 'usage: spidy [console function call each run] [file]'
23
22
  end
24
23
  end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Spidy::Binder::Error < StandardError
4
+ end
@@ -17,9 +17,10 @@ module Spidy::Binder::Html
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
+
23
24
  def self.extended(obj)
24
25
  obj.alias_method :html, :resource
25
26
  end
@@ -17,9 +17,10 @@ module Spidy::Binder::Json
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
+
23
24
  def self.extended(obj)
24
25
  obj.alias_method :json, :resource
25
26
  end
@@ -17,9 +17,10 @@ module Spidy::Binder::Xml
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
+
23
24
  def self.extended(obj)
24
25
  obj.alias_method :xml, :resource
25
26
  end
data/lib/spidy/binder.rb CHANGED
@@ -5,6 +5,7 @@
5
5
  #
6
6
  module Spidy::Binder
7
7
  extend ActiveSupport::Autoload
8
+ autoload :Error
8
9
  autoload :Json
9
10
  autoload :Html
10
11
  autoload :Xml
@@ -5,8 +5,10 @@
5
5
  #
6
6
  class Spidy::CommandLine
7
7
  delegate :spidy, to: :@definition_file
8
- class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
9
- class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
8
+ class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
9
+ class_attribute :error_handler, default: (proc { |e, url|
10
+ warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
11
+ })
10
12
 
11
13
  def eval_call(script)
12
14
  @definition_file.spidy.instance_eval(script)
@@ -14,40 +16,36 @@ class Spidy::CommandLine
14
16
 
15
17
  def initialize(definition_file)
16
18
  @definition_file = definition_file
17
- raise 'unloaded spidy' if definition_file.spidy.nil?
19
+ fail 'unloaded spidy' if definition_file.spidy.nil?
18
20
  end
19
21
 
20
22
  def each_stdin_lines(name)
21
- STDIN.each_line do |url|
22
- begin
23
- spidy.each(url.strip, name: name, &output)
24
- rescue => e
25
- error_handler.call(e, url)
26
- end
23
+ $stdin.each_line do |url|
24
+ spidy.each(url.strip, name: name, &output)
25
+ rescue StandardError => e
26
+ error_handler.call(e, url)
27
27
  end
28
28
  end
29
29
 
30
30
  def call_stdin_lines(name)
31
- STDIN.each_line do |url|
32
- begin
33
- spidy.call(url.strip, name: name, &output)
34
- rescue => e
35
- error_handler.call(e, url)
36
- end
31
+ $stdin.each_line do |url|
32
+ spidy.call(url.strip, name: name, &output)
33
+ rescue StandardError => e
34
+ error_handler.call(e, url)
37
35
  end
38
36
  end
39
37
 
40
38
  def call(name)
41
- return call_stdin_lines(name) if FileTest.pipe?(STDIN)
42
- spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
43
- rescue => e
39
+ return call_stdin_lines(name) if FileTest.pipe?($stdin)
40
+ spidy.call(name: name, &output) unless FileTest.pipe?($stdin)
41
+ rescue StandardError => e
44
42
  error_handler.call(e, nil)
45
43
  end
46
44
 
47
45
  def each(name)
48
- return each_stdin_lines(name) if FileTest.pipe?(STDIN)
46
+ return each_stdin_lines(name) if FileTest.pipe?($stdin)
49
47
  spidy.each(name: name, &output)
50
- rescue => e
48
+ rescue StandardError => e
51
49
  error_handler.call(e, nil)
52
50
  end
53
51
 
@@ -63,36 +61,32 @@ class Spidy::CommandLine
63
61
  end
64
62
 
65
63
  def build(name)
66
- build_shell(name)
67
- build_ruby(name)
64
+ File.write("#{name}.sh", build_shell_script(name))
65
+ File.write("#{name}.rb", build_ruby_script)
68
66
  end
69
67
 
70
68
  def build_shell(name)
71
- File.open("#{name}.sh", 'w') do |f|
72
- f.write <<~SHELL
73
- #!/bin/bash
74
- eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
75
- spider example
76
- SHELL
77
- end
69
+ <<~SHELL
70
+ #!/bin/bash
71
+ eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
72
+ spider
73
+ SHELL
78
74
  end
79
75
 
80
- def build_ruby(name)
81
- File.open("#{name}.rb", 'w') do |f|
82
- f.write <<~RUBY
83
- # frozen_string_literal: true
76
+ def build_ruby
77
+ <<~RUBY
78
+ # frozen_string_literal: true
84
79
 
85
- Spidy.define do
86
- spider(:example) do |yielder, connector|
87
- # connector.call(url) do |resource|
88
- # yielder.call(url or resource)
89
- # end
90
- end
80
+ Spidy.define do
81
+ spider(as: :html) do |yielder, connector|
82
+ # connector.call(url) do |resource|
83
+ # yielder.call(url or resource)
84
+ # end
85
+ end
91
86
 
92
- define(:example) do
93
- end
87
+ define(as: :html) do
94
88
  end
95
- RUBY
96
- end
89
+ end
90
+ RUBY
97
91
  end
98
92
  end
@@ -4,7 +4,7 @@
4
4
  # Direct resource ( not network resource )
5
5
  #
6
6
  class Spidy::Connector::Direct
7
- def call(resource, &yielder)
7
+ def call(resource)
8
8
  if block_given?
9
9
  yield resource
10
10
  else
@@ -12,6 +12,5 @@ class Spidy::Connector::Direct
12
12
  end
13
13
  end
14
14
 
15
- def initialize(user_agent:)
16
- end
15
+ def initialize(user_agent:); end
17
16
  end
@@ -14,13 +14,13 @@ class Spidy::Connector::Html
14
14
 
15
15
  attr_reader :agent
16
16
 
17
- def call(url, encoding: nil, retry_count: 5, &yielder)
17
+ def call(url, encoding: nil, &yielder)
18
18
  fail 'url is not specified' if url.blank?
19
19
  if encoding
20
20
  agent.default_encoding = encoding
21
21
  agent.force_default_encoding = true
22
22
  end
23
- connect(url, retry_count, yielder)
23
+ connect(url, yielder)
24
24
  end
25
25
 
26
26
  def refresh!
@@ -30,17 +30,19 @@ class Spidy::Connector::Html
30
30
 
31
31
  private
32
32
 
33
- def connect(url, retry_count, yielder)
33
+ def connect(url, yielder)
34
34
  result = nil
35
35
  agent.get(url) do |page|
36
- fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
36
+ if page.title == 'Sorry, unable to access page...'
37
+ fail Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
38
+ end
37
39
 
38
40
  result = yielder.call(page)
39
41
  end
40
42
  result
41
43
  rescue Mechanize::ResponseCodeError => e
42
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
44
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '429'
45
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '502'
46
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code))
45
47
  end
46
48
  end
@@ -17,9 +17,9 @@ class Spidy::Connector::Json
17
17
  connect(url, &block)
18
18
  end
19
19
 
20
- def connect(url, retry_count: 5)
21
- OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
20
+ def connect(url)
21
+ OpenURI.open_uri(url, 'User-Agent' => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
22
22
  rescue OpenURI::HTTPError => e
23
- raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
23
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
24
24
  end
25
25
  end
@@ -13,11 +13,11 @@ class Spidy::Connector::Xml
13
13
  end
14
14
 
15
15
  def connect(url, &block)
16
- OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
16
+ OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
17
17
  block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
18
18
  end
19
19
  rescue OpenURI::HTTPError => e
20
- raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
20
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
21
21
  end
22
22
 
23
23
  def initialize(user_agent:)
@@ -27,7 +27,7 @@ module Spidy::Connector
27
27
  #
28
28
  # error output logger
29
29
  #
30
- DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
30
+ DEFAULT_LOGGER = proc { |values| warn(values.to_json) }
31
31
 
32
32
  #
33
33
  # static method
@@ -36,7 +36,9 @@ module Spidy::Connector
36
36
  extend ActiveSupport::Concern
37
37
  class_methods do
38
38
  def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
39
- ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(url, &block)
39
+ ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
40
+ url, &block
41
+ )
40
42
  end
41
43
  end
42
44
  end
@@ -51,6 +53,7 @@ module Spidy::Connector
51
53
  @object = object
52
54
  @response_code = response_code
53
55
  @error = error
56
+ super(error)
54
57
  end
55
58
  end
56
59
 
@@ -58,13 +61,13 @@ module Spidy::Connector
58
61
  # retry
59
62
  #
60
63
  class RetryableCaller
61
- attr_reader :origin_connector
64
+ attr_reader :origin_connector, :logger, :wait_time
62
65
 
63
- def initialize(connector, logger:, wait_time:)
66
+ def initialize(connector, logger:, wait_time:, retry_attempt_count: 5)
64
67
  @origin_connector = connector
65
68
  @logger = logger
66
69
  @wait_time = wait_time
67
- @retry_attempt_count = 5
70
+ @retry_attempt_count = retry_attempt_count
68
71
  end
69
72
 
70
73
  def call(url, &block)
@@ -73,18 +76,18 @@ module Spidy::Connector
73
76
  end
74
77
 
75
78
  def connect(url, retry_attempt_count: @retry_attempt_count, &block)
76
- @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
77
- @origin_connector.call(url, &block)
79
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
80
+ origin_connector.call(url, &block)
78
81
  rescue Spidy::Connector::Retry => e
79
- @logger.call('retry.accessed': Time.current,
80
- 'retry.uri': url,
81
- 'retry.response_code': e.response_code,
82
- 'retry.attempt_count': retry_attempt_count)
82
+ logger.call('retry.accessed': Time.current,
83
+ 'retry.uri': url,
84
+ 'retry.response_code': e.response_code,
85
+ 'retry.attempt_count': retry_attempt_count)
83
86
 
84
87
  retry_attempt_count -= 1
85
88
  if retry_attempt_count.positive?
86
- sleep @wait_time
87
- @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
89
+ sleep wait_time
90
+ origin_connector.refresh! if origin_connector.respond_to?(:refresh!)
88
91
  retry
89
92
  end
90
93
  raise e.error
@@ -103,7 +106,7 @@ module Spidy::Connector
103
106
  end
104
107
 
105
108
  def call(url, &block)
106
- Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
109
+ Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
107
110
  connector.call(url, &block)
108
111
  end
109
112
  end
@@ -141,7 +144,6 @@ module Spidy::Connector
141
144
  fail "Not defined connnector[#{value}]" if connector.nil?
142
145
  return connector if socks_proxy.nil?
143
146
 
144
- tor = TorConnector.new(connector, socks_proxy)
145
- tor
147
+ TorConnector.new(connector, socks_proxy)
146
148
  end
147
149
  end
@@ -33,26 +33,34 @@ module Spidy::Definition
33
33
  spidy = @namespace[:"#{name}_spider"]
34
34
  fail "undefined spidy [#{name}]" if spidy.nil?
35
35
 
36
- spidy.call(source, &yielder)
36
+ if yielder
37
+ spidy.call(source, &yielder)
38
+ else
39
+ Enumerator.new do |enumerate_yielder|
40
+ spidy.call(source, &enumerate_yielder)
41
+ end
42
+ end
37
43
  end
38
44
 
39
45
  def spider(name = :default, connector: nil, as: nil, &define_block)
40
46
  @namespace ||= {}
41
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
47
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
48
+ socks_proxy: @socks_proxy)
42
49
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
43
50
  define_block.call(yielder, connector, source)
44
51
  end
45
52
  end
46
53
 
47
54
  def define(name = :default, connector: nil, as: nil, &define_block)
48
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
55
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
56
+ socks_proxy: @socks_proxy)
49
57
  binder_base = Spidy::Binder.const_get(as.to_s.classify)
50
58
  @namespace ||= {}
51
- @namespace[:"#{name}_scraper"] = Class.new(Spidy::DefineObject) do
59
+ @namespace[:"#{name}_scraper"] = Class.new(Spidy::DefinitionObject) do
52
60
  extend binder_base
53
61
  class_eval(&define_block)
54
62
  define_singleton_method(:call) do |source, &yielder|
55
- yielder = lambda { |result| break result } if yielder.nil?
63
+ yielder = ->(result) { break result } if yielder.nil?
56
64
  connection_yielder = lambda do |page|
57
65
  yielder.call(new(page, source))
58
66
  end
@@ -4,8 +4,7 @@
4
4
  # spidy interface binding
5
5
  #
6
6
  class Spidy::DefinitionFile
7
- attr_reader :path
8
- attr_reader :spidy
7
+ attr_reader :path, :spidy
9
8
 
10
9
  def self.open(filepath)
11
10
  object = new(filepath)
@@ -15,7 +14,7 @@ class Spidy::DefinitionFile
15
14
 
16
15
  # rubocop:disable Security/Eval
17
16
  def eval_definition
18
- @spidy = eval(File.open(path).read) if path
17
+ @spidy = eval(File.read(path)) if path
19
18
  end
20
19
  # rubocop:enable Security/Eval
21
20
 
@@ -1,4 +1,9 @@
1
- class Spidy::DefineObject
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # An object that represents the scraper defined by define block.
5
+ #
6
+ class Spidy::DefinitionObject
2
7
  class << self
3
8
  attr_reader :attribute_names
4
9
  end
@@ -14,6 +19,6 @@ class Spidy::DefineObject
14
19
  end
15
20
 
16
21
  def to_h
17
- self.class.attribute_names.map { |name| [name, send(name)] }.to_h
22
+ self.class.attribute_names.to_h { |name| [name, send(name)] }
18
23
  end
19
24
  end
data/lib/spidy/shell.rb CHANGED
@@ -9,7 +9,12 @@ class Spidy::Shell
9
9
  end
10
10
 
11
11
  def interactive
12
- Pry.start(Spidy::Console.new(@definition_file))
12
+ console = Spidy::Console.new(@definition_file)
13
+ require 'irb'
14
+ IRB.setup(nil)
15
+ irb = IRB::Irb.new(IRB::WorkSpace.new(console))
16
+ IRB.conf[:MAIN_CONTEXT] = irb.context
17
+ irb.eval_input
13
18
  end
14
19
 
15
20
  def command_line
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.3.9'
4
+ VERSION = '0.4.0'
5
5
  end
data/lib/spidy.rb CHANGED
@@ -17,9 +17,9 @@ module Spidy
17
17
  autoload :Console
18
18
  autoload :Definition
19
19
  autoload :DefinitionFile
20
+ autoload :DefinitionObject
20
21
  autoload :Binder
21
22
  autoload :Connector
22
- autoload :DefineObject
23
23
 
24
24
  def self.shell(filepath = nil)
25
25
  Spidy::Shell.new(filepath)
data/spidy.gemspec CHANGED
@@ -25,17 +25,18 @@ Gem::Specification.new do |spec|
25
25
  spec.require_paths = ['lib']
26
26
 
27
27
  spec.add_development_dependency 'bundler', '~> 2.0'
28
- spec.add_development_dependency 'pry'
28
+ spec.add_development_dependency 'capybara_discoball'
29
+ spec.add_development_dependency 'ffaker'
29
30
  spec.add_development_dependency 'rake', '~> 13.0'
30
31
  spec.add_development_dependency 'rspec', '~> 3.0'
31
- spec.add_development_dependency 'ffaker'
32
32
  spec.add_development_dependency 'rspec-command'
33
- spec.add_development_dependency 'capybara_discoball'
34
33
  spec.add_development_dependency 'sinatra'
35
34
 
36
- spec.add_runtime_dependency 'tor'
37
- spec.add_runtime_dependency 'activesupport'
35
+ spec.add_runtime_dependency 'activesupport', '~> 7.1'
38
36
  spec.add_runtime_dependency 'mechanize'
39
37
  spec.add_runtime_dependency 'socksify'
40
- spec.add_runtime_dependency 'pry'
38
+ spec.add_runtime_dependency 'tor'
39
+ spec.metadata = {
40
+ 'rubygems_mfa_required' => 'true'
41
+ }
41
42
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.9
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2021-12-23 00:00:00.000000000 Z
10
+ date: 2025-03-19 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: bundler
@@ -25,7 +24,7 @@ dependencies:
25
24
  - !ruby/object:Gem::Version
26
25
  version: '2.0'
27
26
  - !ruby/object:Gem::Dependency
28
- name: pry
27
+ name: capybara_discoball
29
28
  requirement: !ruby/object:Gem::Requirement
30
29
  requirements:
31
30
  - - ">="
@@ -38,34 +37,6 @@ dependencies:
38
37
  - - ">="
39
38
  - !ruby/object:Gem::Version
40
39
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '13.0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '13.0'
55
- - !ruby/object:Gem::Dependency
56
- name: rspec
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '3.0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '3.0'
69
40
  - !ruby/object:Gem::Dependency
70
41
  name: ffaker
71
42
  requirement: !ruby/object:Gem::Requirement
@@ -81,35 +52,35 @@ dependencies:
81
52
  - !ruby/object:Gem::Version
82
53
  version: '0'
83
54
  - !ruby/object:Gem::Dependency
84
- name: rspec-command
55
+ name: rake
85
56
  requirement: !ruby/object:Gem::Requirement
86
57
  requirements:
87
- - - ">="
58
+ - - "~>"
88
59
  - !ruby/object:Gem::Version
89
- version: '0'
60
+ version: '13.0'
90
61
  type: :development
91
62
  prerelease: false
92
63
  version_requirements: !ruby/object:Gem::Requirement
93
64
  requirements:
94
- - - ">="
65
+ - - "~>"
95
66
  - !ruby/object:Gem::Version
96
- version: '0'
67
+ version: '13.0'
97
68
  - !ruby/object:Gem::Dependency
98
- name: capybara_discoball
69
+ name: rspec
99
70
  requirement: !ruby/object:Gem::Requirement
100
71
  requirements:
101
- - - ">="
72
+ - - "~>"
102
73
  - !ruby/object:Gem::Version
103
- version: '0'
74
+ version: '3.0'
104
75
  type: :development
105
76
  prerelease: false
106
77
  version_requirements: !ruby/object:Gem::Requirement
107
78
  requirements:
108
- - - ">="
79
+ - - "~>"
109
80
  - !ruby/object:Gem::Version
110
- version: '0'
81
+ version: '3.0'
111
82
  - !ruby/object:Gem::Dependency
112
- name: sinatra
83
+ name: rspec-command
113
84
  requirement: !ruby/object:Gem::Requirement
114
85
  requirements:
115
86
  - - ">="
@@ -123,13 +94,13 @@ dependencies:
123
94
  - !ruby/object:Gem::Version
124
95
  version: '0'
125
96
  - !ruby/object:Gem::Dependency
126
- name: tor
97
+ name: sinatra
127
98
  requirement: !ruby/object:Gem::Requirement
128
99
  requirements:
129
100
  - - ">="
130
101
  - !ruby/object:Gem::Version
131
102
  version: '0'
132
- type: :runtime
103
+ type: :development
133
104
  prerelease: false
134
105
  version_requirements: !ruby/object:Gem::Requirement
135
106
  requirements:
@@ -140,16 +111,16 @@ dependencies:
140
111
  name: activesupport
141
112
  requirement: !ruby/object:Gem::Requirement
142
113
  requirements:
143
- - - ">="
114
+ - - "~>"
144
115
  - !ruby/object:Gem::Version
145
- version: '0'
116
+ version: '7.1'
146
117
  type: :runtime
147
118
  prerelease: false
148
119
  version_requirements: !ruby/object:Gem::Requirement
149
120
  requirements:
150
- - - ">="
121
+ - - "~>"
151
122
  - !ruby/object:Gem::Version
152
- version: '0'
123
+ version: '7.1'
153
124
  - !ruby/object:Gem::Dependency
154
125
  name: mechanize
155
126
  requirement: !ruby/object:Gem::Requirement
@@ -179,7 +150,7 @@ dependencies:
179
150
  - !ruby/object:Gem::Version
180
151
  version: '0'
181
152
  - !ruby/object:Gem::Dependency
182
- name: pry
153
+ name: tor
183
154
  requirement: !ruby/object:Gem::Requirement
184
155
  requirements:
185
156
  - - ">="
@@ -192,7 +163,6 @@ dependencies:
192
163
  - - ">="
193
164
  - !ruby/object:Gem::Version
194
165
  version: '0'
195
- description:
196
166
  email:
197
167
  - aileron.cc@gmail.com
198
168
  executables:
@@ -207,6 +177,7 @@ files:
207
177
  - ".ruby-version"
208
178
  - ".travis.yml"
209
179
  - CHANGELOG.md
180
+ - CLAUDE.md
210
181
  - CODE_OF_CONDUCT.md
211
182
  - Gemfile
212
183
  - Gemfile.lock
@@ -222,6 +193,7 @@ files:
222
193
  - exe/spidy
223
194
  - lib/spidy.rb
224
195
  - lib/spidy/binder.rb
196
+ - lib/spidy/binder/error.rb
225
197
  - lib/spidy/binder/html.rb
226
198
  - lib/spidy/binder/json.rb
227
199
  - lib/spidy/binder/xml.rb
@@ -232,9 +204,9 @@ files:
232
204
  - lib/spidy/connector/json.rb
233
205
  - lib/spidy/connector/xml.rb
234
206
  - lib/spidy/console.rb
235
- - lib/spidy/define_object.rb
236
207
  - lib/spidy/definition.rb
237
208
  - lib/spidy/definition_file.rb
209
+ - lib/spidy/definition_object.rb
238
210
  - lib/spidy/shell.rb
239
211
  - lib/spidy/spider.rb
240
212
  - lib/spidy/version.rb
@@ -243,8 +215,8 @@ files:
243
215
  homepage: https://github.com/aileron-inc/spidy
244
216
  licenses:
245
217
  - MIT
246
- metadata: {}
247
- post_install_message:
218
+ metadata:
219
+ rubygems_mfa_required: 'true'
248
220
  rdoc_options: []
249
221
  require_paths:
250
222
  - lib
@@ -259,8 +231,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
259
231
  - !ruby/object:Gem::Version
260
232
  version: '0'
261
233
  requirements: []
262
- rubygems_version: 3.2.22
263
- signing_key:
234
+ rubygems_version: 3.6.5
264
235
  specification_version: 4
265
236
  summary: web spider dsl
266
237
  test_files: []