spidy 0.0.24 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile.lock +3 -1
- data/lib/spidy/binder/html.rb +6 -7
- data/lib/spidy/binder/json.rb +1 -2
- data/lib/spidy/binder/xml.rb +1 -2
- data/lib/spidy/connector.rb +10 -0
- data/lib/spidy/connector/direct.rb +10 -0
- data/lib/spidy/connector/html.rb +3 -11
- data/lib/spidy/connector/json.rb +2 -2
- data/lib/spidy/connector/xml.rb +3 -2
- data/lib/spidy/console.rb +4 -4
- data/lib/spidy/definition.rb +12 -24
- data/lib/spidy/shell.rb +29 -7
- data/lib/spidy/version.rb +1 -1
- data/spidy.gemspec +1 -0
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2418b53cec63dbd0cebe2dd7b0d10e9be1ea1177cd4be3584d3f04e2adbf6c10
|
4
|
+
data.tar.gz: bc8b78146e0f1f08b9b6fb052093ba92ca03aecf546a8ca9b079eab5991b8acf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb04e182f956b07c7bfc2290dabacdca588a940ba7aacfea1b417be94e7332bb5d9793e698150455b7fd484c2e745015c3cd4efe66f697f92710275d322376f1
|
7
|
+
data.tar.gz: 5cef25c2ca15ace3137cedb5181da48a95c67390304a4ab28c92f57de05801c4492296578e275cbcea51a597fa501a0ef661c20f4c0e67075dc492be57e558fa
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.0
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.0.
|
4
|
+
spidy (0.0.24)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
@@ -21,6 +21,7 @@ GEM
|
|
21
21
|
diff-lcs (1.3)
|
22
22
|
domain_name (0.5.20190701)
|
23
23
|
unf (>= 0.0.5, < 1.0.0)
|
24
|
+
ffaker (2.10.0)
|
24
25
|
http-cookie (1.0.3)
|
25
26
|
domain_name (~> 0.5)
|
26
27
|
i18n (1.7.0)
|
@@ -77,6 +78,7 @@ PLATFORMS
|
|
77
78
|
|
78
79
|
DEPENDENCIES
|
79
80
|
bundler (~> 2.0)
|
81
|
+
ffaker
|
80
82
|
pry
|
81
83
|
rake (~> 10.0)
|
82
84
|
rspec (~> 3.0)
|
data/lib/spidy/binder/html.rb
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::Binder::Html
|
7
7
|
class << self
|
8
|
-
attr_reader :
|
8
|
+
attr_reader :attribute_names
|
9
9
|
|
10
|
-
@names = []
|
11
10
|
def let(name, query = nil, &block)
|
12
|
-
@
|
11
|
+
@attribute_names ||= []
|
12
|
+
@attribute_names << name
|
13
13
|
define_method(name) do
|
14
14
|
return html.at(query)&.text if block.nil?
|
15
15
|
return instance_exec(&block) if query.blank?
|
@@ -21,12 +21,11 @@ class Spidy::Binder::Html
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
attr_reader :html, :
|
24
|
+
attr_reader :html, :url
|
25
25
|
|
26
26
|
def initialize(html, url: nil)
|
27
|
-
@html = html
|
28
27
|
@url = url
|
29
|
-
@
|
28
|
+
@html = html
|
30
29
|
end
|
31
30
|
|
32
31
|
def to_s
|
@@ -34,6 +33,6 @@ class Spidy::Binder::Html
|
|
34
33
|
end
|
35
34
|
|
36
35
|
def to_h
|
37
|
-
self.class.
|
36
|
+
self.class.attribute_names.map { |name| [name, send(name)] }.to_h
|
38
37
|
end
|
39
38
|
end
|
data/lib/spidy/binder/json.rb
CHANGED
data/lib/spidy/binder/xml.rb
CHANGED
data/lib/spidy/connector.rb
CHANGED
@@ -5,10 +5,20 @@
|
|
5
5
|
#
|
6
6
|
module Spidy::Connector
|
7
7
|
extend ActiveSupport::Autoload
|
8
|
+
autoload :Direct
|
8
9
|
autoload :Html
|
9
10
|
autoload :Json
|
10
11
|
autoload :Xml
|
11
12
|
|
13
|
+
USER_AGENT = [
|
14
|
+
'Mozilla/5.0',
|
15
|
+
'(Macintosh; Intel Mac OS X 10_12_6)',
|
16
|
+
'AppleWebKit/537.36',
|
17
|
+
'(KHTML, like Gecko)',
|
18
|
+
'Chrome/64.0.3282.186',
|
19
|
+
'Safari/537.36'
|
20
|
+
].join(' ')
|
21
|
+
|
12
22
|
def self.get(value)
|
13
23
|
return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
|
14
24
|
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -18,18 +18,10 @@ module Spidy::Connector::Html
|
|
18
18
|
@response_code = error.try(:response_code) || page.try(:response_code)
|
19
19
|
end
|
20
20
|
end
|
21
|
-
USER_AGENT = [
|
22
|
-
'Mozilla/5.0',
|
23
|
-
'(Macintosh; Intel Mac OS X 10_12_6)',
|
24
|
-
'AppleWebKit/537.36',
|
25
|
-
'(KHTML, like Gecko)',
|
26
|
-
'Chrome/64.0.3282.186',
|
27
|
-
'Safari/537.36'
|
28
|
-
].join(' ')
|
29
21
|
|
30
|
-
@logger = proc { |values| STDERR.puts(values.
|
22
|
+
@logger = proc { |values| STDERR.puts(values.to_json) }
|
31
23
|
@agent = Mechanize.new
|
32
|
-
@agent.user_agent = USER_AGENT
|
24
|
+
@agent.user_agent = Spidy::Connector::USER_AGENT
|
33
25
|
|
34
26
|
class << self
|
35
27
|
attr_reader :agent
|
@@ -56,7 +48,7 @@ module Spidy::Connector::Html
|
|
56
48
|
'retry.rest_count': retry_count)
|
57
49
|
|
58
50
|
@agent = Mechanize.new
|
59
|
-
@agent.user_agent = USER_AGENT
|
51
|
+
@agent.user_agent = Spidy::Connector::USER_AGENT
|
60
52
|
|
61
53
|
retry_count -= 1
|
62
54
|
if retry_count.positive?
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -4,8 +4,8 @@
|
|
4
4
|
# OpenURI to JSON.parse
|
5
5
|
#
|
6
6
|
module Spidy::Connector::Json
|
7
|
-
def self.call(url
|
7
|
+
def self.call(url)
|
8
8
|
fail 'url is not specified' if url.blank?
|
9
|
-
OpenURI.open_uri(url) { |body|
|
9
|
+
OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
10
10
|
end
|
11
11
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -7,7 +7,8 @@ module Spidy::Connector::Xml
|
|
7
7
|
def self.call(url)
|
8
8
|
fail 'URL is undefined' if url.blank?
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
|
11
|
+
yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
12
|
+
end
|
12
13
|
end
|
13
14
|
end
|
data/lib/spidy/console.rb
CHANGED
@@ -19,11 +19,11 @@ class Spidy::Console
|
|
19
19
|
@definition_file&.eval_definition
|
20
20
|
end
|
21
21
|
|
22
|
-
def call(name =
|
23
|
-
spidy.call(name, url: url, &block)
|
22
|
+
def call(name = :default, url: nil, &block)
|
23
|
+
spidy.call(name: name, url: url, &block)
|
24
24
|
end
|
25
25
|
|
26
|
-
def each(name =
|
27
|
-
spidy.each(name, url: url, &block)
|
26
|
+
def each(name = :default, url: nil, &block)
|
27
|
+
spidy.each(name: name, url: url, &block)
|
28
28
|
end
|
29
29
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -8,27 +8,27 @@ module Spidy::Definition
|
|
8
8
|
@namespace
|
9
9
|
end
|
10
10
|
|
11
|
-
def call(
|
11
|
+
def call(source = nil, name: :default, &yielder)
|
12
12
|
name = name.presence || :default
|
13
13
|
spidy = @namespace[:"#{name}_scraper"]
|
14
14
|
fail "undefined spidy [#{name}]" if spidy.nil?
|
15
15
|
|
16
|
-
|
16
|
+
spidy.call(source, &yielder)
|
17
17
|
end
|
18
18
|
|
19
|
-
def each(
|
19
|
+
def each(source = nil, name: :default, &yielder)
|
20
20
|
name = name.presence || :default
|
21
21
|
spidy = @namespace[:"#{name}_spider"]
|
22
22
|
fail "undefined spidy [#{name}]" if spidy.nil?
|
23
23
|
|
24
|
-
|
24
|
+
spidy.call(source, &yielder)
|
25
25
|
end
|
26
26
|
|
27
|
-
def spider(name = :default, connector: nil, as: nil)
|
27
|
+
def spider(name = :default, connector: nil, as: nil, &define_block)
|
28
28
|
@namespace ||= {}
|
29
|
-
connector = Spidy::Connector.get(
|
30
|
-
@namespace[:"#{name}_spider"] = proc do |
|
31
|
-
|
29
|
+
connector = Spidy::Connector.get(connector || as)
|
30
|
+
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
31
|
+
define_block.call(yielder, connector, source)
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -41,26 +41,14 @@ module Spidy::Definition
|
|
41
41
|
|
42
42
|
private
|
43
43
|
|
44
|
-
def exec(spidy, url: nil, stream: nil, err: nil, &output)
|
45
|
-
return spidy.call(url, &output) if stream.nil?
|
46
|
-
|
47
|
-
stream.each do |value|
|
48
|
-
spidy.call(value.strip, &output)
|
49
|
-
rescue StandardError => e
|
50
|
-
raise e if err.nil?
|
51
|
-
|
52
|
-
err.call(e, value.strip)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
44
|
def define_proc(connector, binder, define_block)
|
57
|
-
proc do |
|
45
|
+
proc do |source, &yielder|
|
58
46
|
fail 'block is not specified' if yielder.nil?
|
59
47
|
|
60
|
-
connection_yielder = lambda do |
|
61
|
-
binder.call(
|
48
|
+
connection_yielder = lambda do |page|
|
49
|
+
binder.call(page, url: source, define: define_block) { |object| yielder.call(object) }
|
62
50
|
end
|
63
|
-
connector.call(
|
51
|
+
connector.call(source, &connection_yielder)
|
64
52
|
end
|
65
53
|
end
|
66
54
|
end
|
data/lib/spidy/shell.rb
CHANGED
@@ -8,23 +8,45 @@ require 'pry'
|
|
8
8
|
class Spidy::Shell
|
9
9
|
attr_reader :definition_file, :spidy
|
10
10
|
class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
|
11
|
-
class_attribute :error_handler, default: (proc { |e, url| STDERR.puts(
|
11
|
+
class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
|
12
12
|
delegate :spidy, to: :definition_file
|
13
13
|
|
14
14
|
def initialize(definition_file)
|
15
15
|
@definition_file = definition_file
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
19
|
-
|
18
|
+
def each_stdin_lines(name)
|
19
|
+
STDIN.each_line do |url|
|
20
|
+
begin
|
21
|
+
spidy.each(url.strip, name: name, &output)
|
22
|
+
rescue => e
|
23
|
+
error_handler.call(e, url)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def call_stdin_lines(name)
|
29
|
+
STDIN.each_line do |url|
|
30
|
+
begin
|
31
|
+
spidy.call(url.strip, name: name, &output)
|
32
|
+
rescue => e
|
33
|
+
error_handler.call(e, url)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
20
37
|
|
21
|
-
|
38
|
+
def call(name)
|
39
|
+
return call_stdin_lines(name) if FileTest.pipe?(STDIN)
|
40
|
+
spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
|
41
|
+
rescue => e
|
42
|
+
error_handler.call(e, nil)
|
22
43
|
end
|
23
44
|
|
24
45
|
def each(name)
|
25
|
-
return
|
26
|
-
|
27
|
-
|
46
|
+
return each_stdin_lines(name) if FileTest.pipe?(STDIN)
|
47
|
+
spidy.each(name: name, &output)
|
48
|
+
rescue => e
|
49
|
+
error_handler.call(e, nil)
|
28
50
|
end
|
29
51
|
|
30
52
|
def function
|
data/lib/spidy/version.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -28,6 +28,7 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_development_dependency 'pry'
|
29
29
|
spec.add_development_dependency 'rake', '~> 10.0'
|
30
30
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
31
|
+
spec.add_development_dependency 'ffaker'
|
31
32
|
|
32
33
|
spec.add_runtime_dependency 'activesupport'
|
33
34
|
spec.add_runtime_dependency 'mechanize'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ffaker
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: activesupport
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,6 +152,7 @@ files:
|
|
138
152
|
- lib/spidy/binder/json.rb
|
139
153
|
- lib/spidy/binder/xml.rb
|
140
154
|
- lib/spidy/connector.rb
|
155
|
+
- lib/spidy/connector/direct.rb
|
141
156
|
- lib/spidy/connector/html.rb
|
142
157
|
- lib/spidy/connector/json.rb
|
143
158
|
- lib/spidy/connector/xml.rb
|
@@ -168,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
168
183
|
- !ruby/object:Gem::Version
|
169
184
|
version: '0'
|
170
185
|
requirements: []
|
171
|
-
rubygems_version: 3.
|
186
|
+
rubygems_version: 3.1.2
|
172
187
|
signing_key:
|
173
188
|
specification_version: 4
|
174
189
|
summary: web spider dsl
|