spidy 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/Gemfile.lock +1 -1
- data/README.md +38 -1
- data/lib/spidy.rb +7 -1
- data/lib/spidy/connector/html.rb +39 -9
- data/lib/spidy/console.rb +3 -3
- data/lib/spidy/definition.rb +48 -23
- data/lib/spidy/definition_file.rb +2 -9
- data/lib/spidy/shell.rb +10 -18
- data/lib/spidy/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d88e7028d82156f7710e85af609aa8c703a92b5d48f92af5047b39f64e98b4ad
|
4
|
+
data.tar.gz: 984a44e45d06d72a0e8eaaa3c142f13fe81c5f77b0606b4dc883b695ae024f7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2c6dafc543a85fc7c58ca9d7b3b9dc9fe3823fbe35aad3dfa60b7fece6434e3b0c4fbef6b507bd73ed7e827611a01d065b89df0b9d33e06b203707801eddaee
|
7
|
+
data.tar.gz: 82466dba270ffbec392383edd7afdad91d6c742d3580b733ef489e5199fd5c44956b70bd43bcb64d9e91b161e7aedf18fef96c4965ae655cece11b9ab585ebfc
|
data/.rubocop.yml
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -20,7 +20,44 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
|
23
|
+
### When used from the command line
|
24
|
+
|
25
|
+
website.rb
|
26
|
+
```rb
|
27
|
+
Spidy.define do
|
28
|
+
spider(as: :html) do |yielder, connector, url|
|
29
|
+
connector.call(url) do |html|
|
30
|
+
# html as nokogiri object ( mechanize )
|
31
|
+
yielder.call(url)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
define(as: :html) do
|
36
|
+
let(:object_name, 'nokogiri query')
|
37
|
+
end
|
38
|
+
end
|
39
|
+
```
|
40
|
+
```bash
|
41
|
+
echo 'http://example.com' | spidy each website.rb > urls
|
42
|
+
cat urls | spidy call website.rb > website.json
|
43
|
+
# shorthands
|
44
|
+
echo 'http://example.com' | spidy each website.rb | spidy call website.rb | jq .
|
45
|
+
```
|
46
|
+
|
47
|
+
### When used from Ruby code
|
48
|
+
```rb
|
49
|
+
a = Spidy.define do
|
50
|
+
# Implementing spiders and scrapers
|
51
|
+
end
|
52
|
+
|
53
|
+
a.each(url) do |url|
|
54
|
+
# Loop for the number of retrieved URLs
|
55
|
+
end
|
56
|
+
|
57
|
+
a.call(url) do |object|
|
58
|
+
# The scrape result is passed as a defined object
|
59
|
+
end
|
60
|
+
```
|
24
61
|
|
25
62
|
## Development
|
26
63
|
|
data/lib/spidy.rb
CHANGED
@@ -29,6 +29,12 @@ module Spidy
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def self.define(&block)
|
32
|
-
|
32
|
+
Module.new do
|
33
|
+
class_eval do
|
34
|
+
extend ::Spidy::Definition
|
35
|
+
module_eval(&block)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
# Module.new(::Spidy::Definition, &block)
|
33
39
|
end
|
34
40
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -4,6 +4,18 @@
|
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
6
|
module Spidy::Connector::Html
|
7
|
+
#
|
8
|
+
# retry class
|
9
|
+
#
|
10
|
+
class Retry < StandardError
|
11
|
+
attr_reader :response_code
|
12
|
+
attr_reader :wait_time
|
13
|
+
|
14
|
+
def initialize(wait_time: 2, page: nil, error: nil)
|
15
|
+
@wait_time = wait_time
|
16
|
+
@response_code = error.try(:response_code) || page.try(:response_code)
|
17
|
+
end
|
18
|
+
end
|
7
19
|
USER_AGENT = [
|
8
20
|
'Mozilla/5.0',
|
9
21
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
@@ -13,33 +25,51 @@ module Spidy::Connector::Html
|
|
13
25
|
'Safari/537.36'
|
14
26
|
].join(' ')
|
15
27
|
|
28
|
+
@logger = proc { |values| STDERR.puts(values.map { |k, v| "#{k}:#{v}" }.join("\t")) }
|
16
29
|
@agent = Mechanize.new
|
17
30
|
@agent.user_agent = USER_AGENT
|
18
31
|
|
19
32
|
class << self
|
20
33
|
attr_reader :agent
|
34
|
+
attr_accessor :logger
|
21
35
|
|
22
36
|
def call(url, encoding: nil, retry_count: 3, &yielder)
|
23
37
|
if encoding
|
24
|
-
|
25
|
-
|
38
|
+
agent.default_encoding = encoding
|
39
|
+
agent.force_default_encoding = true
|
26
40
|
end
|
41
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
27
42
|
get(url, retry_count, yielder)
|
28
43
|
end
|
29
44
|
|
30
45
|
private
|
31
46
|
|
47
|
+
# rubocop:disable Metrics/MethodLength
|
32
48
|
def get(url, retry_count, yielder)
|
33
|
-
|
49
|
+
agent.get(url) do |page|
|
50
|
+
fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
|
51
|
+
|
52
|
+
yielder.call(page)
|
53
|
+
end
|
34
54
|
rescue Mechanize::ResponseCodeError => e
|
55
|
+
raise Retry, error: e if e.response_code == '429'
|
56
|
+
raise e
|
57
|
+
rescue Retry => e
|
58
|
+
logger.call('retry.accessed': Time.current,
|
59
|
+
'retry.uri': url,
|
60
|
+
'retry.response_code': e.response_code,
|
61
|
+
'retry.rest_count': retry_count)
|
62
|
+
|
63
|
+
@agent = Mechanize.new
|
64
|
+
@agent.user_agent = USER_AGENT
|
65
|
+
|
35
66
|
retry_count -= 1
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
retry if retry_count
|
40
|
-
else
|
41
|
-
raise e
|
67
|
+
if retry_count.positive?
|
68
|
+
sleep e.wait_time
|
69
|
+
retry
|
42
70
|
end
|
71
|
+
raise e
|
43
72
|
end
|
73
|
+
# rubocop:enable Metrics/MethodLength
|
44
74
|
end
|
45
75
|
end
|
data/lib/spidy/console.rb
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::Console
|
7
7
|
attr_reader :definition_file
|
8
|
-
delegate :
|
8
|
+
delegate :spidy, to: :definition_file
|
9
9
|
|
10
10
|
def initialize(definition_file = nil)
|
11
11
|
@definition_file = definition_file
|
@@ -20,10 +20,10 @@ class Spidy::Console
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def call(name = :default, url = nil, &block)
|
23
|
-
|
23
|
+
spidy.call(name, url: url, &block)
|
24
24
|
end
|
25
25
|
|
26
26
|
def each(name = :default, url = nil, &block)
|
27
|
-
|
27
|
+
spidy.each(name, url: url, &block)
|
28
28
|
end
|
29
29
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -3,35 +3,60 @@
|
|
3
3
|
#
|
4
4
|
# Class representing a website defined by DSL
|
5
5
|
#
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
yield(yielder, connector, url)
|
15
|
-
end
|
16
|
-
end
|
6
|
+
module Spidy::Definition
|
7
|
+
def call(name = :default, url: nil, stream: nil, err: nil, &output)
|
8
|
+
name = name.presence || :default
|
9
|
+
spidy = @namespace["#{name}_scraper"]
|
10
|
+
fail "undefined spidy [#{name}]" if spidy.nil?
|
11
|
+
|
12
|
+
exec(spidy, url: url, stream: stream, err: err, &output)
|
13
|
+
end
|
17
14
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
def each(name = :default, url: nil, stream: nil, err: nil, &output)
|
16
|
+
name = name.presence || :default
|
17
|
+
spidy = @namespace["#{name}_spider"]
|
18
|
+
fail "undefined spidy [#{name}]" if spidy.nil?
|
19
|
+
|
20
|
+
exec(spidy, url: url, stream: stream, err: err, &output)
|
21
|
+
end
|
22
|
+
|
23
|
+
def spider(name = :default, connector: nil, as: nil)
|
24
|
+
@namespace ||= {}
|
25
|
+
connector = Spidy::Connector.get(as || connector) || connector
|
26
|
+
@namespace["#{name}_spider"] = proc do |url, &yielder|
|
27
|
+
yield(yielder, connector, url)
|
22
28
|
end
|
29
|
+
end
|
23
30
|
|
24
|
-
|
31
|
+
def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
|
32
|
+
@namespace ||= {}
|
33
|
+
connector = Spidy::Connector.get(as || connector) || connector
|
34
|
+
binder = Spidy::Binder.get(as || binder) || binder
|
35
|
+
@namespace["#{name}_scraper"] = define_proc(connector, binder, define_block)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def exec(spidy, url: nil, stream: nil, err: nil, &output)
|
41
|
+
return spidy.call(url, &output) if stream.nil?
|
42
|
+
|
43
|
+
stream.each do |value|
|
44
|
+
spidy.call(value.strip, &output)
|
45
|
+
rescue StandardError => e
|
46
|
+
raise e if err.nil?
|
47
|
+
|
48
|
+
err.call(e, value.strip)
|
49
|
+
end
|
50
|
+
end
|
25
51
|
|
26
|
-
|
27
|
-
|
28
|
-
|
52
|
+
def define_proc(connector, binder, define_block)
|
53
|
+
proc do |url, &yielder|
|
54
|
+
fail 'invalid argument [Required url / block]' if url.blank? && yielder.nil?
|
29
55
|
|
30
|
-
|
31
|
-
|
32
|
-
end
|
33
|
-
connector.call(url, &connection_yielder)
|
56
|
+
connection_yielder = lambda do |resource|
|
57
|
+
binder.call(resource, define_block) { |object| yielder.call(object) }
|
34
58
|
end
|
59
|
+
connector.call(url, &connection_yielder)
|
35
60
|
end
|
36
61
|
end
|
37
62
|
end
|
data/lib/spidy/definition_file.rb
CHANGED
@@ -5,14 +5,7 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::DefinitionFile
|
7
7
|
attr_reader :path
|
8
|
-
attr_reader :
|
9
|
-
delegate :namespace, :spiders, to: :definition
|
10
|
-
|
11
|
-
CSV = lambda do |result|
|
12
|
-
::CSV.generate do |csv|
|
13
|
-
csv << result.definition.attributes_to_array
|
14
|
-
end
|
15
|
-
end
|
8
|
+
attr_reader :spidy
|
16
9
|
|
17
10
|
def self.open(filepath)
|
18
11
|
object = new(filepath)
|
@@ -22,7 +15,7 @@ class Spidy::DefinitionFile
|
|
22
15
|
|
23
16
|
# rubocop:disable Security/Eval
|
24
17
|
def eval_definition
|
25
|
-
@
|
18
|
+
@spidy = eval(File.open(path).read)
|
26
19
|
end
|
27
20
|
# rubocop:enable Security/Eval
|
28
21
|
|
data/lib/spidy/shell.rb
CHANGED
@@ -6,20 +6,25 @@ require 'pry'
|
|
6
6
|
# spidy shell interface
|
7
7
|
#
|
8
8
|
class Spidy::Shell
|
9
|
-
attr_reader :definition_file
|
10
|
-
class_attribute :
|
11
|
-
|
9
|
+
attr_reader :definition_file, :spidy
|
10
|
+
class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
|
11
|
+
class_attribute :error_handler, default: (proc { |e, url| STDERR.puts("#{url}\n #{e.message}") })
|
12
|
+
delegate :spidy, to: :definition_file
|
12
13
|
|
13
14
|
def initialize(definition_file)
|
14
15
|
@definition_file = definition_file
|
15
16
|
end
|
16
17
|
|
17
18
|
def call(name)
|
18
|
-
|
19
|
+
return spidy.call(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
|
20
|
+
|
21
|
+
spidy.call(name, err: error_handler, &output)
|
19
22
|
end
|
20
23
|
|
21
24
|
def each(name)
|
22
|
-
|
25
|
+
return spidy.each(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
|
26
|
+
|
27
|
+
spidy.each(name, err: error_handler, &output)
|
23
28
|
end
|
24
29
|
|
25
30
|
def function
|
@@ -66,17 +71,4 @@ class Spidy::Shell
|
|
66
71
|
RUBY
|
67
72
|
end
|
68
73
|
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def exec(command)
|
73
|
-
fail "undefined commmand[#{name}]" if command.nil?
|
74
|
-
return command.call(&output_yielder) unless FileTest.pipe?(STDIN)
|
75
|
-
|
76
|
-
STDIN.each do |line|
|
77
|
-
command.call(line.strip, &output_yielder)
|
78
|
-
rescue StandardError => e
|
79
|
-
STDERR.puts("#{line.strip}\n #{e.message}")
|
80
|
-
end
|
81
|
-
end
|
82
74
|
end
|
data/lib/spidy/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|