spidy 0.0.24 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile.lock +3 -1
- data/lib/spidy/binder/html.rb +6 -7
- data/lib/spidy/binder/json.rb +1 -2
- data/lib/spidy/binder/xml.rb +1 -2
- data/lib/spidy/connector.rb +10 -0
- data/lib/spidy/connector/direct.rb +10 -0
- data/lib/spidy/connector/html.rb +3 -11
- data/lib/spidy/connector/json.rb +2 -2
- data/lib/spidy/connector/xml.rb +3 -2
- data/lib/spidy/console.rb +4 -4
- data/lib/spidy/definition.rb +12 -24
- data/lib/spidy/shell.rb +29 -7
- data/lib/spidy/version.rb +1 -1
- data/spidy.gemspec +1 -0
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2418b53cec63dbd0cebe2dd7b0d10e9be1ea1177cd4be3584d3f04e2adbf6c10
|
4
|
+
data.tar.gz: bc8b78146e0f1f08b9b6fb052093ba92ca03aecf546a8ca9b079eab5991b8acf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb04e182f956b07c7bfc2290dabacdca588a940ba7aacfea1b417be94e7332bb5d9793e698150455b7fd484c2e745015c3cd4efe66f697f92710275d322376f1
|
7
|
+
data.tar.gz: 5cef25c2ca15ace3137cedb5181da48a95c67390304a4ab28c92f57de05801c4492296578e275cbcea51a597fa501a0ef661c20f4c0e67075dc492be57e558fa
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.0
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.0.
|
4
|
+
spidy (0.0.24)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
@@ -21,6 +21,7 @@ GEM
|
|
21
21
|
diff-lcs (1.3)
|
22
22
|
domain_name (0.5.20190701)
|
23
23
|
unf (>= 0.0.5, < 1.0.0)
|
24
|
+
ffaker (2.10.0)
|
24
25
|
http-cookie (1.0.3)
|
25
26
|
domain_name (~> 0.5)
|
26
27
|
i18n (1.7.0)
|
@@ -77,6 +78,7 @@ PLATFORMS
|
|
77
78
|
|
78
79
|
DEPENDENCIES
|
79
80
|
bundler (~> 2.0)
|
81
|
+
ffaker
|
80
82
|
pry
|
81
83
|
rake (~> 10.0)
|
82
84
|
rspec (~> 3.0)
|
data/lib/spidy/binder/html.rb
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::Binder::Html
|
7
7
|
class << self
|
8
|
-
attr_reader :
|
8
|
+
attr_reader :attribute_names
|
9
9
|
|
10
|
-
@names = []
|
11
10
|
def let(name, query = nil, &block)
|
12
|
-
@
|
11
|
+
@attribute_names ||= []
|
12
|
+
@attribute_names << name
|
13
13
|
define_method(name) do
|
14
14
|
return html.at(query)&.text if block.nil?
|
15
15
|
return instance_exec(&block) if query.blank?
|
@@ -21,12 +21,11 @@ class Spidy::Binder::Html
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
attr_reader :html, :
|
24
|
+
attr_reader :html, :url
|
25
25
|
|
26
26
|
def initialize(html, url: nil)
|
27
|
-
@html = html
|
28
27
|
@url = url
|
29
|
-
@
|
28
|
+
@html = html
|
30
29
|
end
|
31
30
|
|
32
31
|
def to_s
|
@@ -34,6 +33,6 @@ class Spidy::Binder::Html
|
|
34
33
|
end
|
35
34
|
|
36
35
|
def to_h
|
37
|
-
self.class.
|
36
|
+
self.class.attribute_names.map { |name| [name, send(name)] }.to_h
|
38
37
|
end
|
39
38
|
end
|
data/lib/spidy/binder/json.rb
CHANGED
data/lib/spidy/binder/xml.rb
CHANGED
data/lib/spidy/connector.rb
CHANGED
@@ -5,10 +5,20 @@
|
|
5
5
|
#
|
6
6
|
module Spidy::Connector
|
7
7
|
extend ActiveSupport::Autoload
|
8
|
+
autoload :Direct
|
8
9
|
autoload :Html
|
9
10
|
autoload :Json
|
10
11
|
autoload :Xml
|
11
12
|
|
13
|
+
USER_AGENT = [
|
14
|
+
'Mozilla/5.0',
|
15
|
+
'(Macintosh; Intel Mac OS X 10_12_6)',
|
16
|
+
'AppleWebKit/537.36',
|
17
|
+
'(KHTML, like Gecko)',
|
18
|
+
'Chrome/64.0.3282.186',
|
19
|
+
'Safari/537.36'
|
20
|
+
].join(' ')
|
21
|
+
|
12
22
|
def self.get(value)
|
13
23
|
return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
|
14
24
|
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -18,18 +18,10 @@ module Spidy::Connector::Html
|
|
18
18
|
@response_code = error.try(:response_code) || page.try(:response_code)
|
19
19
|
end
|
20
20
|
end
|
21
|
-
USER_AGENT = [
|
22
|
-
'Mozilla/5.0',
|
23
|
-
'(Macintosh; Intel Mac OS X 10_12_6)',
|
24
|
-
'AppleWebKit/537.36',
|
25
|
-
'(KHTML, like Gecko)',
|
26
|
-
'Chrome/64.0.3282.186',
|
27
|
-
'Safari/537.36'
|
28
|
-
].join(' ')
|
29
21
|
|
30
|
-
@logger = proc { |values| STDERR.puts(values.
|
22
|
+
@logger = proc { |values| STDERR.puts(values.to_json) }
|
31
23
|
@agent = Mechanize.new
|
32
|
-
@agent.user_agent = USER_AGENT
|
24
|
+
@agent.user_agent = Spidy::Connector::USER_AGENT
|
33
25
|
|
34
26
|
class << self
|
35
27
|
attr_reader :agent
|
@@ -56,7 +48,7 @@ module Spidy::Connector::Html
|
|
56
48
|
'retry.rest_count': retry_count)
|
57
49
|
|
58
50
|
@agent = Mechanize.new
|
59
|
-
@agent.user_agent = USER_AGENT
|
51
|
+
@agent.user_agent = Spidy::Connector::USER_AGENT
|
60
52
|
|
61
53
|
retry_count -= 1
|
62
54
|
if retry_count.positive?
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -4,8 +4,8 @@
|
|
4
4
|
# OpenURI to JSON.parse
|
5
5
|
#
|
6
6
|
module Spidy::Connector::Json
|
7
|
-
def self.call(url
|
7
|
+
def self.call(url)
|
8
8
|
fail 'url is not specified' if url.blank?
|
9
|
-
OpenURI.open_uri(url) { |body|
|
9
|
+
OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
10
10
|
end
|
11
11
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -7,7 +7,8 @@ module Spidy::Connector::Xml
|
|
7
7
|
def self.call(url)
|
8
8
|
fail 'URL is undefined' if url.blank?
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
|
11
|
+
yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
12
|
+
end
|
12
13
|
end
|
13
14
|
end
|
data/lib/spidy/console.rb
CHANGED
@@ -19,11 +19,11 @@ class Spidy::Console
|
|
19
19
|
@definition_file&.eval_definition
|
20
20
|
end
|
21
21
|
|
22
|
-
def call(name =
|
23
|
-
spidy.call(name, url: url, &block)
|
22
|
+
def call(name = :default, url: nil, &block)
|
23
|
+
spidy.call(name: name, url: url, &block)
|
24
24
|
end
|
25
25
|
|
26
|
-
def each(name =
|
27
|
-
spidy.each(name, url: url, &block)
|
26
|
+
def each(name = :default, url: nil, &block)
|
27
|
+
spidy.each(name: name, url: url, &block)
|
28
28
|
end
|
29
29
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -8,27 +8,27 @@ module Spidy::Definition
|
|
8
8
|
@namespace
|
9
9
|
end
|
10
10
|
|
11
|
-
def call(
|
11
|
+
def call(source = nil, name: :default, &yielder)
|
12
12
|
name = name.presence || :default
|
13
13
|
spidy = @namespace[:"#{name}_scraper"]
|
14
14
|
fail "undefined spidy [#{name}]" if spidy.nil?
|
15
15
|
|
16
|
-
|
16
|
+
spidy.call(source, &yielder)
|
17
17
|
end
|
18
18
|
|
19
|
-
def each(
|
19
|
+
def each(source = nil, name: :default, &yielder)
|
20
20
|
name = name.presence || :default
|
21
21
|
spidy = @namespace[:"#{name}_spider"]
|
22
22
|
fail "undefined spidy [#{name}]" if spidy.nil?
|
23
23
|
|
24
|
-
|
24
|
+
spidy.call(source, &yielder)
|
25
25
|
end
|
26
26
|
|
27
|
-
def spider(name = :default, connector: nil, as: nil)
|
27
|
+
def spider(name = :default, connector: nil, as: nil, &define_block)
|
28
28
|
@namespace ||= {}
|
29
|
-
connector = Spidy::Connector.get(
|
30
|
-
@namespace[:"#{name}_spider"] = proc do |
|
31
|
-
|
29
|
+
connector = Spidy::Connector.get(connector || as)
|
30
|
+
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
31
|
+
define_block.call(yielder, connector, source)
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -41,26 +41,14 @@ module Spidy::Definition
|
|
41
41
|
|
42
42
|
private
|
43
43
|
|
44
|
-
def exec(spidy, url: nil, stream: nil, err: nil, &output)
|
45
|
-
return spidy.call(url, &output) if stream.nil?
|
46
|
-
|
47
|
-
stream.each do |value|
|
48
|
-
spidy.call(value.strip, &output)
|
49
|
-
rescue StandardError => e
|
50
|
-
raise e if err.nil?
|
51
|
-
|
52
|
-
err.call(e, value.strip)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
44
|
def define_proc(connector, binder, define_block)
|
57
|
-
proc do |
|
45
|
+
proc do |source, &yielder|
|
58
46
|
fail 'block is not specified' if yielder.nil?
|
59
47
|
|
60
|
-
connection_yielder = lambda do |
|
61
|
-
binder.call(
|
48
|
+
connection_yielder = lambda do |page|
|
49
|
+
binder.call(page, url: source, define: define_block) { |object| yielder.call(object) }
|
62
50
|
end
|
63
|
-
connector.call(
|
51
|
+
connector.call(source, &connection_yielder)
|
64
52
|
end
|
65
53
|
end
|
66
54
|
end
|
data/lib/spidy/shell.rb
CHANGED
@@ -8,23 +8,45 @@ require 'pry'
|
|
8
8
|
class Spidy::Shell
|
9
9
|
attr_reader :definition_file, :spidy
|
10
10
|
class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
|
11
|
-
class_attribute :error_handler, default: (proc { |e, url| STDERR.puts(
|
11
|
+
class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
|
12
12
|
delegate :spidy, to: :definition_file
|
13
13
|
|
14
14
|
def initialize(definition_file)
|
15
15
|
@definition_file = definition_file
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
19
|
-
|
18
|
+
def each_stdin_lines(name)
|
19
|
+
STDIN.each_line do |url|
|
20
|
+
begin
|
21
|
+
spidy.each(url.strip, name: name, &output)
|
22
|
+
rescue => e
|
23
|
+
error_handler.call(e, url)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def call_stdin_lines(name)
|
29
|
+
STDIN.each_line do |url|
|
30
|
+
begin
|
31
|
+
spidy.call(url.strip, name: name, &output)
|
32
|
+
rescue => e
|
33
|
+
error_handler.call(e, url)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
20
37
|
|
21
|
-
|
38
|
+
def call(name)
|
39
|
+
return call_stdin_lines(name) if FileTest.pipe?(STDIN)
|
40
|
+
spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
|
41
|
+
rescue => e
|
42
|
+
error_handler.call(e, nil)
|
22
43
|
end
|
23
44
|
|
24
45
|
def each(name)
|
25
|
-
return
|
26
|
-
|
27
|
-
|
46
|
+
return each_stdin_lines(name) if FileTest.pipe?(STDIN)
|
47
|
+
spidy.each(name: name, &output)
|
48
|
+
rescue => e
|
49
|
+
error_handler.call(e, nil)
|
28
50
|
end
|
29
51
|
|
30
52
|
def function
|
data/lib/spidy/version.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -28,6 +28,7 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_development_dependency 'pry'
|
29
29
|
spec.add_development_dependency 'rake', '~> 10.0'
|
30
30
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
31
|
+
spec.add_development_dependency 'ffaker'
|
31
32
|
|
32
33
|
spec.add_runtime_dependency 'activesupport'
|
33
34
|
spec.add_runtime_dependency 'mechanize'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ffaker
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: activesupport
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,6 +152,7 @@ files:
|
|
138
152
|
- lib/spidy/binder/json.rb
|
139
153
|
- lib/spidy/binder/xml.rb
|
140
154
|
- lib/spidy/connector.rb
|
155
|
+
- lib/spidy/connector/direct.rb
|
141
156
|
- lib/spidy/connector/html.rb
|
142
157
|
- lib/spidy/connector/json.rb
|
143
158
|
- lib/spidy/connector/xml.rb
|
@@ -168,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
168
183
|
- !ruby/object:Gem::Version
|
169
184
|
version: '0'
|
170
185
|
requirements: []
|
171
|
-
rubygems_version: 3.
|
186
|
+
rubygems_version: 3.1.2
|
172
187
|
signing_key:
|
173
188
|
specification_version: 4
|
174
189
|
summary: web spider dsl
|