spidy 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/Gemfile.lock +1 -1
- data/README.md +38 -1
- data/lib/spidy.rb +7 -1
- data/lib/spidy/connector/html.rb +39 -9
- data/lib/spidy/console.rb +3 -3
- data/lib/spidy/definition.rb +48 -23
- data/lib/spidy/definition_file.rb +2 -9
- data/lib/spidy/shell.rb +10 -18
- data/lib/spidy/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d88e7028d82156f7710e85af609aa8c703a92b5d48f92af5047b39f64e98b4ad
|
4
|
+
data.tar.gz: 984a44e45d06d72a0e8eaaa3c142f13fe81c5f77b0606b4dc883b695ae024f7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2c6dafc543a85fc7c58ca9d7b3b9dc9fe3823fbe35aad3dfa60b7fece6434e3b0c4fbef6b507bd73ed7e827611a01d065b89df0b9d33e06b203707801eddaee
|
7
|
+
data.tar.gz: 82466dba270ffbec392383edd7afdad91d6c742d3580b733ef489e5199fd5c44956b70bd43bcb64d9e91b161e7aedf18fef96c4965ae655cece11b9ab585ebfc
|
data/.rubocop.yml
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -20,7 +20,44 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
|
23
|
+
### When used from the command line
|
24
|
+
|
25
|
+
website.rb
|
26
|
+
```rb
|
27
|
+
Spidy.define do
|
28
|
+
spider(as: :html) do |yielder, connector, url|
|
29
|
+
connector.call(url) do |html|
|
30
|
+
# html as nokogiri object ( mechanize )
|
31
|
+
yielder.call(url)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
define(as: :html) do
|
36
|
+
let(:object_name, 'nokogiri query')
|
37
|
+
end
|
38
|
+
end
|
39
|
+
```
|
40
|
+
```bash
|
41
|
+
echo 'http://example.com' | spidy each website.rb > urls
|
42
|
+
cat urls | spidy call website.rb > website.json
|
43
|
+
# shorthands
|
44
|
+
echo 'http://example.com' | spidy each website.rb | spidy call website.rb | jq .
|
45
|
+
```
|
46
|
+
|
47
|
+
### When used from Ruby code
|
48
|
+
```rb
|
49
|
+
a = Spidy.define do
|
50
|
+
# Implementing spiders and scrapers
|
51
|
+
end
|
52
|
+
|
53
|
+
a.each(url) do |url|
|
54
|
+
# Loop for the number of retrieved URLs
|
55
|
+
end
|
56
|
+
|
57
|
+
a.call(url) do |object|
|
58
|
+
# The scrape result is passed as a defined object
|
59
|
+
end
|
60
|
+
```
|
24
61
|
|
25
62
|
## Development
|
26
63
|
|
data/lib/spidy.rb
CHANGED
@@ -29,6 +29,12 @@ module Spidy
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def self.define(&block)
|
32
|
-
|
32
|
+
Module.new do
|
33
|
+
class_eval do
|
34
|
+
extend ::Spidy::Definition
|
35
|
+
module_eval(&block)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
# Module.new(::Spidy::Definition, &block)
|
33
39
|
end
|
34
40
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -4,6 +4,18 @@
|
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
6
|
module Spidy::Connector::Html
|
7
|
+
#
|
8
|
+
# retry class
|
9
|
+
#
|
10
|
+
class Retry < StandardError
|
11
|
+
attr_reader :response_code
|
12
|
+
attr_reader :wait_time
|
13
|
+
|
14
|
+
def initialize(wait_time: 2, page: nil, error: nil)
|
15
|
+
@wait_time = wait_time
|
16
|
+
@response_code = error.try(:response_code) || page.try(:response_code)
|
17
|
+
end
|
18
|
+
end
|
7
19
|
USER_AGENT = [
|
8
20
|
'Mozilla/5.0',
|
9
21
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
@@ -13,33 +25,51 @@ module Spidy::Connector::Html
|
|
13
25
|
'Safari/537.36'
|
14
26
|
].join(' ')
|
15
27
|
|
28
|
+
@logger = proc { |values| STDERR.puts(values.map { |k, v| "#{k}:#{v}" }.join("\t")) }
|
16
29
|
@agent = Mechanize.new
|
17
30
|
@agent.user_agent = USER_AGENT
|
18
31
|
|
19
32
|
class << self
|
20
33
|
attr_reader :agent
|
34
|
+
attr_accessor :logger
|
21
35
|
|
22
36
|
def call(url, encoding: nil, retry_count: 3, &yielder)
|
23
37
|
if encoding
|
24
|
-
|
25
|
-
|
38
|
+
agent.default_encoding = encoding
|
39
|
+
agent.force_default_encoding = true
|
26
40
|
end
|
41
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
27
42
|
get(url, retry_count, yielder)
|
28
43
|
end
|
29
44
|
|
30
45
|
private
|
31
46
|
|
47
|
+
# rubocop:disable Metrics/MethodLength
|
32
48
|
def get(url, retry_count, yielder)
|
33
|
-
|
49
|
+
agent.get(url) do |page|
|
50
|
+
fail Retry, page: page, wait_time: 5 if page.title == 'Sorry, unable to access page...'
|
51
|
+
|
52
|
+
yielder.call(page)
|
53
|
+
end
|
34
54
|
rescue Mechanize::ResponseCodeError => e
|
55
|
+
raise Retry, error: e if e.response_code == '429'
|
56
|
+
raise e
|
57
|
+
rescue Retry => e
|
58
|
+
logger.call('retry.accessed': Time.current,
|
59
|
+
'retry.uri': url,
|
60
|
+
'retry.response_code': e.response_code,
|
61
|
+
'retry.rest_count': retry_count)
|
62
|
+
|
63
|
+
@agent = Mechanize.new
|
64
|
+
@agent.user_agent = USER_AGENT
|
65
|
+
|
35
66
|
retry_count -= 1
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
retry if retry_count
|
40
|
-
else
|
41
|
-
raise e
|
67
|
+
if retry_count.positive?
|
68
|
+
sleep e.wait_time
|
69
|
+
retry
|
42
70
|
end
|
71
|
+
raise e
|
43
72
|
end
|
73
|
+
# rubocop:enable Metrics/MethodLength
|
44
74
|
end
|
45
75
|
end
|
data/lib/spidy/console.rb
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::Console
|
7
7
|
attr_reader :definition_file
|
8
|
-
delegate :
|
8
|
+
delegate :spidy, to: :definition_file
|
9
9
|
|
10
10
|
def initialize(definition_file = nil)
|
11
11
|
@definition_file = definition_file
|
@@ -20,10 +20,10 @@ class Spidy::Console
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def call(name = :default, url = nil, &block)
|
23
|
-
|
23
|
+
spidy.call(name, url: url, &block)
|
24
24
|
end
|
25
25
|
|
26
26
|
def each(name = :default, url = nil, &block)
|
27
|
-
|
27
|
+
spidy.each(name, url: url, &block)
|
28
28
|
end
|
29
29
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -3,35 +3,60 @@
|
|
3
3
|
#
|
4
4
|
# Class representing a website defined by DSL
|
5
5
|
#
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
yield(yielder, connector, url)
|
15
|
-
end
|
16
|
-
end
|
6
|
+
module Spidy::Definition
|
7
|
+
def call(name = :default, url: nil, stream: nil, err: nil, &output)
|
8
|
+
name = name.presence || :default
|
9
|
+
spidy = @namespace["#{name}_scraper"]
|
10
|
+
fail "undefined spidy [#{name}]" if spidy.nil?
|
11
|
+
|
12
|
+
exec(spidy, url: url, stream: stream, err: err, &output)
|
13
|
+
end
|
17
14
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
def each(name = :default, url: nil, stream: nil, err: nil, &output)
|
16
|
+
name = name.presence || :default
|
17
|
+
spidy = @namespace["#{name}_spider"]
|
18
|
+
fail "undefined spidy [#{name}]" if spidy.nil?
|
19
|
+
|
20
|
+
exec(spidy, url: url, stream: stream, err: err, &output)
|
21
|
+
end
|
22
|
+
|
23
|
+
def spider(name = :default, connector: nil, as: nil)
|
24
|
+
@namespace ||= {}
|
25
|
+
connector = Spidy::Connector.get(as || connector) || connector
|
26
|
+
@namespace["#{name}_spider"] = proc do |url, &yielder|
|
27
|
+
yield(yielder, connector, url)
|
22
28
|
end
|
29
|
+
end
|
23
30
|
|
24
|
-
|
31
|
+
def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
|
32
|
+
@namespace ||= {}
|
33
|
+
connector = Spidy::Connector.get(as || connector) || connector
|
34
|
+
binder = Spidy::Binder.get(as || binder) || binder
|
35
|
+
@namespace["#{name}_scraper"] = define_proc(connector, binder, define_block)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def exec(spidy, url: nil, stream: nil, err: nil, &output)
|
41
|
+
return spidy.call(url, &output) if stream.nil?
|
42
|
+
|
43
|
+
stream.each do |value|
|
44
|
+
spidy.call(value.strip, &output)
|
45
|
+
rescue StandardError => e
|
46
|
+
raise e if err.nil?
|
47
|
+
|
48
|
+
err.call(e, value.strip)
|
49
|
+
end
|
50
|
+
end
|
25
51
|
|
26
|
-
|
27
|
-
|
28
|
-
|
52
|
+
def define_proc(connector, binder, define_block)
|
53
|
+
proc do |url, &yielder|
|
54
|
+
fail 'invalid argument [Required url / block]' if url.blank? && yielder.nil?
|
29
55
|
|
30
|
-
|
31
|
-
|
32
|
-
end
|
33
|
-
connector.call(url, &connection_yielder)
|
56
|
+
connection_yielder = lambda do |resource|
|
57
|
+
binder.call(resource, define_block) { |object| yielder.call(object) }
|
34
58
|
end
|
59
|
+
connector.call(url, &connection_yielder)
|
35
60
|
end
|
36
61
|
end
|
37
62
|
end
|
@@ -5,14 +5,7 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::DefinitionFile
|
7
7
|
attr_reader :path
|
8
|
-
attr_reader :
|
9
|
-
delegate :namespace, :spiders, to: :definition
|
10
|
-
|
11
|
-
CSV = lambda do |result|
|
12
|
-
::CSV.generate do |csv|
|
13
|
-
csv << result.definition.attributes_to_array
|
14
|
-
end
|
15
|
-
end
|
8
|
+
attr_reader :spidy
|
16
9
|
|
17
10
|
def self.open(filepath)
|
18
11
|
object = new(filepath)
|
@@ -22,7 +15,7 @@ class Spidy::DefinitionFile
|
|
22
15
|
|
23
16
|
# rubocop:disable Security/Eval
|
24
17
|
def eval_definition
|
25
|
-
@
|
18
|
+
@spidy = eval(File.open(path).read)
|
26
19
|
end
|
27
20
|
# rubocop:enable Security/Eval
|
28
21
|
|
data/lib/spidy/shell.rb
CHANGED
@@ -6,20 +6,25 @@ require 'pry'
|
|
6
6
|
# spidy shell interface
|
7
7
|
#
|
8
8
|
class Spidy::Shell
|
9
|
-
attr_reader :definition_file
|
10
|
-
class_attribute :
|
11
|
-
|
9
|
+
attr_reader :definition_file, :spidy
|
10
|
+
class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
|
11
|
+
class_attribute :error_handler, default: (proc { |e, url| STDERR.puts("#{url}\n #{e.message}") })
|
12
|
+
delegate :spidy, to: :definition_file
|
12
13
|
|
13
14
|
def initialize(definition_file)
|
14
15
|
@definition_file = definition_file
|
15
16
|
end
|
16
17
|
|
17
18
|
def call(name)
|
18
|
-
|
19
|
+
return spidy.call(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
|
20
|
+
|
21
|
+
spidy.call(name, err: error_handler, &output)
|
19
22
|
end
|
20
23
|
|
21
24
|
def each(name)
|
22
|
-
|
25
|
+
return spidy.each(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
|
26
|
+
|
27
|
+
spidy.each(name, err: error_handler, &output)
|
23
28
|
end
|
24
29
|
|
25
30
|
def function
|
@@ -66,17 +71,4 @@ class Spidy::Shell
|
|
66
71
|
RUBY
|
67
72
|
end
|
68
73
|
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def exec(command)
|
73
|
-
fail "undefined commmand[#{name}]" if command.nil?
|
74
|
-
return command.call(&output_yielder) unless FileTest.pipe?(STDIN)
|
75
|
-
|
76
|
-
STDIN.each do |line|
|
77
|
-
command.call(line.strip, &output_yielder)
|
78
|
-
rescue StandardError => e
|
79
|
-
STDERR.puts("#{line.strip}\n #{e.message}")
|
80
|
-
end
|
81
|
-
end
|
82
74
|
end
|
data/lib/spidy/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|