spidy 0.0.24 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 11b932ab32a92fe7cca2a4c7fb91204098204d2b60d3226137fc41c783f50997
4
- data.tar.gz: 34fcddd2519415792d0222bf0c8176dd06a734cc48db3025ccbadd6d10a80582
3
+ metadata.gz: 2418b53cec63dbd0cebe2dd7b0d10e9be1ea1177cd4be3584d3f04e2adbf6c10
4
+ data.tar.gz: bc8b78146e0f1f08b9b6fb052093ba92ca03aecf546a8ca9b079eab5991b8acf
5
5
  SHA512:
6
- metadata.gz: 64fe7aa6a460a4c13d90749827f85904c1cf8ca14391cc2e12f22e2ec4b5b8a3fd7776ad282e178cadddaa6db197a83f49b5870b27aaa8356b79d20565992383
7
- data.tar.gz: a8405babf239188f933a13b5213bbc67efe5610a7e7f68bfa49c646a755ecebba049cdab4dbe6b52b085c71f6a0ffd82609ea5aadf9ef355bd8bf6200bdfe5f0
6
+ metadata.gz: fb04e182f956b07c7bfc2290dabacdca588a940ba7aacfea1b417be94e7332bb5d9793e698150455b7fd484c2e745015c3cd4efe66f697f92710275d322376f1
7
+ data.tar.gz: 5cef25c2ca15ace3137cedb5181da48a95c67390304a4ab28c92f57de05801c4492296578e275cbcea51a597fa501a0ef661c20f4c0e67075dc492be57e558fa
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.2
1
+ 2.7.0
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.0.21)
4
+ spidy (0.0.24)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -21,6 +21,7 @@ GEM
21
21
  diff-lcs (1.3)
22
22
  domain_name (0.5.20190701)
23
23
  unf (>= 0.0.5, < 1.0.0)
24
+ ffaker (2.10.0)
24
25
  http-cookie (1.0.3)
25
26
  domain_name (~> 0.5)
26
27
  i18n (1.7.0)
@@ -77,6 +78,7 @@ PLATFORMS
77
78
 
78
79
  DEPENDENCIES
79
80
  bundler (~> 2.0)
81
+ ffaker
80
82
  pry
81
83
  rake (~> 10.0)
82
84
  rspec (~> 3.0)
@@ -5,11 +5,11 @@
5
5
  #
6
6
  class Spidy::Binder::Html
7
7
  class << self
8
- attr_reader :names
8
+ attr_reader :attribute_names
9
9
 
10
- @names = []
11
10
  def let(name, query = nil, &block)
12
- @names << name
11
+ @attribute_names ||= []
12
+ @attribute_names << name
13
13
  define_method(name) do
14
14
  return html.at(query)&.text if block.nil?
15
15
  return instance_exec(&block) if query.blank?
@@ -21,12 +21,11 @@ class Spidy::Binder::Html
21
21
  end
22
22
  end
23
23
 
24
- attr_reader :html, :source, :url
24
+ attr_reader :html, :url
25
25
 
26
26
  def initialize(html, url: nil)
27
- @html = html
28
27
  @url = url
29
- @source = html.body
28
+ @html = html
30
29
  end
31
30
 
32
31
  def to_s
@@ -34,6 +33,6 @@ class Spidy::Binder::Html
34
33
  end
35
34
 
36
35
  def to_h
37
- self.class.names.map { |name| [name, send(name)] }.to_h
36
+ self.class.attribute_names.map { |name| [name, send(name)] }.to_h
38
37
  end
39
38
  end
@@ -20,12 +20,11 @@ class Spidy::Binder::Json
20
20
  end
21
21
  end
22
22
 
23
- attr_reader :json, :source, :url
23
+ attr_reader :json, :url
24
24
 
25
25
  def initialize(json, url: nil)
26
26
  @json = json
27
27
  @url = url
28
- @source = json.to_json
29
28
  end
30
29
 
31
30
  def to_s
@@ -21,12 +21,11 @@ class Spidy::Binder::Xml
21
21
  end
22
22
  end
23
23
 
24
- attr_reader :xml, :source, :url
24
+ attr_reader :xml, :url
25
25
 
26
26
  def initialize(xml, url: nil)
27
27
  @xml = xml
28
28
  @url = url
29
- @source = xml.to_s
30
29
  end
31
30
 
32
31
  def to_s
@@ -5,10 +5,20 @@
5
5
  #
6
6
  module Spidy::Connector
7
7
  extend ActiveSupport::Autoload
8
+ autoload :Direct
8
9
  autoload :Html
9
10
  autoload :Json
10
11
  autoload :Xml
11
12
 
13
+ USER_AGENT = [
14
+ 'Mozilla/5.0',
15
+ '(Macintosh; Intel Mac OS X 10_12_6)',
16
+ 'AppleWebKit/537.36',
17
+ '(KHTML, like Gecko)',
18
+ 'Chrome/64.0.3282.186',
19
+ 'Safari/537.36'
20
+ ].join(' ')
21
+
12
22
  def self.get(value)
13
23
  return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
14
24
 
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Direct resource ( not network resource )
5
+ #
6
+ module Spidy::Connector::Direct
7
+ def self.call(resource, &yielder)
8
+ yielder.call(resource)
9
+ end
10
+ end
@@ -18,18 +18,10 @@ module Spidy::Connector::Html
18
18
  @response_code = error.try(:response_code) || page.try(:response_code)
19
19
  end
20
20
  end
21
- USER_AGENT = [
22
- 'Mozilla/5.0',
23
- '(Macintosh; Intel Mac OS X 10_12_6)',
24
- 'AppleWebKit/537.36',
25
- '(KHTML, like Gecko)',
26
- 'Chrome/64.0.3282.186',
27
- 'Safari/537.36'
28
- ].join(' ')
29
21
 
30
- @logger = proc { |values| STDERR.puts(values.map { |k, v| "#{k}:#{v}" }.join("\t")) }
22
+ @logger = proc { |values| STDERR.puts(values.to_json) }
31
23
  @agent = Mechanize.new
32
- @agent.user_agent = USER_AGENT
24
+ @agent.user_agent = Spidy::Connector::USER_AGENT
33
25
 
34
26
  class << self
35
27
  attr_reader :agent
@@ -56,7 +48,7 @@ module Spidy::Connector::Html
56
48
  'retry.rest_count': retry_count)
57
49
 
58
50
  @agent = Mechanize.new
59
- @agent.user_agent = USER_AGENT
51
+ @agent.user_agent = Spidy::Connector::USER_AGENT
60
52
 
61
53
  retry_count -= 1
62
54
  if retry_count.positive?
@@ -4,8 +4,8 @@
4
4
  # OpenURI to JSON.parse
5
5
  #
6
6
  module Spidy::Connector::Json
7
- def self.call(url, &yielder)
7
+ def self.call(url)
8
8
  fail 'url is not specified' if url.blank?
9
- OpenURI.open_uri(url) { |body| yielder.call(JSON.parse(body.read, symbolize_names: true)) }
9
+ OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
10
10
  end
11
11
  end
@@ -7,7 +7,8 @@ module Spidy::Connector::Xml
7
7
  def self.call(url)
8
8
  fail 'URL is undefined' if url.blank?
9
9
 
10
- xml = OpenURI.open_uri(url).read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, '')
11
- yield Nokogiri::XML(xml, url)
10
+ OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
11
+ yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
+ end
12
13
  end
13
14
  end
data/lib/spidy/console.rb CHANGED
@@ -19,11 +19,11 @@ class Spidy::Console
19
19
  @definition_file&.eval_definition
20
20
  end
21
21
 
22
- def call(name = nil, url: nil, &block)
23
- spidy.call(name, url: url, &block)
22
+ def call(name = :default, url: nil, &block)
23
+ spidy.call(name: name, url: url, &block)
24
24
  end
25
25
 
26
- def each(name = nil, url: nil, &block)
27
- spidy.each(name, url: url, &block)
26
+ def each(name = :default, url: nil, &block)
27
+ spidy.each(name: name, url: url, &block)
28
28
  end
29
29
  end
@@ -8,27 +8,27 @@ module Spidy::Definition
8
8
  @namespace
9
9
  end
10
10
 
11
- def call(name = :default, url: nil, stream: nil, err: nil, &output)
11
+ def call(source = nil, name: :default, &yielder)
12
12
  name = name.presence || :default
13
13
  spidy = @namespace[:"#{name}_scraper"]
14
14
  fail "undefined spidy [#{name}]" if spidy.nil?
15
15
 
16
- exec(spidy, url: url, stream: stream, err: err, &output)
16
+ spidy.call(source, &yielder)
17
17
  end
18
18
 
19
- def each(name = :default, url: nil, stream: nil, err: nil, &output)
19
+ def each(source = nil, name: :default, &yielder)
20
20
  name = name.presence || :default
21
21
  spidy = @namespace[:"#{name}_spider"]
22
22
  fail "undefined spidy [#{name}]" if spidy.nil?
23
23
 
24
- exec(spidy, url: url, stream: stream, err: err, &output)
24
+ spidy.call(source, &yielder)
25
25
  end
26
26
 
27
- def spider(name = :default, connector: nil, as: nil)
27
+ def spider(name = :default, connector: nil, as: nil, &define_block)
28
28
  @namespace ||= {}
29
- connector = Spidy::Connector.get(as || connector) || connector
30
- @namespace[:"#{name}_spider"] = proc do |url, &yielder|
31
- yield(yielder, connector, url)
29
+ connector = Spidy::Connector.get(connector || as)
30
+ @namespace[:"#{name}_spider"] = proc do |source, &yielder|
31
+ define_block.call(yielder, connector, source)
32
32
  end
33
33
  end
34
34
 
@@ -41,26 +41,14 @@ module Spidy::Definition
41
41
 
42
42
  private
43
43
 
44
- def exec(spidy, url: nil, stream: nil, err: nil, &output)
45
- return spidy.call(url, &output) if stream.nil?
46
-
47
- stream.each do |value|
48
- spidy.call(value.strip, &output)
49
- rescue StandardError => e
50
- raise e if err.nil?
51
-
52
- err.call(e, value.strip)
53
- end
54
- end
55
-
56
44
  def define_proc(connector, binder, define_block)
57
- proc do |url, &yielder|
45
+ proc do |source, &yielder|
58
46
  fail 'block is not specified' if yielder.nil?
59
47
 
60
- connection_yielder = lambda do |resource|
61
- binder.call(resource, url: url, define: define_block) { |object| yielder.call(object) }
48
+ connection_yielder = lambda do |page|
49
+ binder.call(page, url: source, define: define_block) { |object| yielder.call(object) }
62
50
  end
63
- connector.call(url, &connection_yielder)
51
+ connector.call(source, &connection_yielder)
64
52
  end
65
53
  end
66
54
  end
data/lib/spidy/shell.rb CHANGED
@@ -8,23 +8,45 @@ require 'pry'
8
8
  class Spidy::Shell
9
9
  attr_reader :definition_file, :spidy
10
10
  class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
11
- class_attribute :error_handler, default: (proc { |e, url| STDERR.puts("#{url}\n #{e.message}") })
11
+ class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
12
12
  delegate :spidy, to: :definition_file
13
13
 
14
14
  def initialize(definition_file)
15
15
  @definition_file = definition_file
16
16
  end
17
17
 
18
- def call(name)
19
- return spidy.call(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
18
+ def each_stdin_lines(name)
19
+ STDIN.each_line do |url|
20
+ begin
21
+ spidy.each(url.strip, name: name, &output)
22
+ rescue => e
23
+ error_handler.call(e, url)
24
+ end
25
+ end
26
+ end
27
+
28
+ def call_stdin_lines(name)
29
+ STDIN.each_line do |url|
30
+ begin
31
+ spidy.call(url.strip, name: name, &output)
32
+ rescue => e
33
+ error_handler.call(e, url)
34
+ end
35
+ end
36
+ end
20
37
 
21
- spidy.call(name, err: error_handler, &output)
38
+ def call(name)
39
+ return call_stdin_lines(name) if FileTest.pipe?(STDIN)
40
+ spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
41
+ rescue => e
42
+ error_handler.call(e, nil)
22
43
  end
23
44
 
24
45
  def each(name)
25
- return spidy.each(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
26
-
27
- spidy.each(name, err: error_handler, &output)
46
+ return each_stdin_lines(name) if FileTest.pipe?(STDIN)
47
+ spidy.each(name: name, &output)
48
+ rescue => e
49
+ error_handler.call(e, nil)
28
50
  end
29
51
 
30
52
  def function
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.0.24'
4
+ VERSION = '0.1.0'
5
5
  end
data/spidy.gemspec CHANGED
@@ -28,6 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency 'pry'
29
29
  spec.add_development_dependency 'rake', '~> 10.0'
30
30
  spec.add_development_dependency 'rspec', '~> 3.0'
31
+ spec.add_development_dependency 'ffaker'
31
32
 
32
33
  spec.add_runtime_dependency 'activesupport'
33
34
  spec.add_runtime_dependency 'mechanize'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.24
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-26 00:00:00.000000000 Z
11
+ date: 2020-01-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ffaker
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: activesupport
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -138,6 +152,7 @@ files:
138
152
  - lib/spidy/binder/json.rb
139
153
  - lib/spidy/binder/xml.rb
140
154
  - lib/spidy/connector.rb
155
+ - lib/spidy/connector/direct.rb
141
156
  - lib/spidy/connector/html.rb
142
157
  - lib/spidy/connector/json.rb
143
158
  - lib/spidy/connector/xml.rb
@@ -168,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
183
  - !ruby/object:Gem::Version
169
184
  version: '0'
170
185
  requirements: []
171
- rubygems_version: 3.0.3
186
+ rubygems_version: 3.1.2
172
187
  signing_key:
173
188
  specification_version: 4
174
189
  summary: web spider dsl