daimon_skycrawlers 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2fd637d109812fe657a771536f3c3a0041e4868a
4
- data.tar.gz: 031bf50b2b72e6320ee748cb9044e177f24f5d2f
3
+ metadata.gz: 58dd7f91f6e9da8f9388a3364731ab0a543c01cd
4
+ data.tar.gz: b815a1cdad154eaf1b828568a697887df02bfcb6
5
5
  SHA512:
6
- metadata.gz: 59f6404ea231ecc337b1658406daa8c1e3b0d557ddc3d46c37efa91b1f7d3e0f8018d524aa3c2cc227fbee57d04ef51d7561ae879fd5d0fc719f4fca1308afb3
7
- data.tar.gz: eeb06f74ae722a6cdf18d00f0aeda3e06e06199e02ddd2f0e52fedb3c7a7028c4a99e46a8c19d2a4e6aa0c3979f0602757d5598a5e1de39112457d1b5dce5679
6
+ metadata.gz: 1ffbdd022a6e6a3a80d292bae8927a716f33f339e19e12f67da536812f9118615be0d8886fc143b529b7854f3b4ef0731962f3102aa6c759790d86dd2df54914
7
+ data.tar.gz: 1beb4753e8602224c95081651a4da6276659bfe132cdb9dc5fe55e8ae9f2c524681bd9be8137d4e29d52380d0f137ea2630097337bc07aea31de55e3f2785d3c
@@ -3,8 +3,44 @@ require "daimon_skycrawlers/crawler"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Processor
6
+ #
7
+ # Web spider class.
8
+ # By default, extracts all links and follows them.
9
+ #
10
+ # @example Google search result (2016-11-29)
11
+ # spider = DaimonSkycrawlers::Processor::Spider.new
12
+ # spider.configure do |s|
13
+ # s.link_rules = ".g .r a"
14
+ # s.extract_link do |element|
15
+ # element["data-href"]
16
+ # end
17
+ # s.link_message = { next: "detail" }
18
+ # s.next_page_link_rules = "a#pnnext"
19
+ # s.next_page_link_message = { next: "spider" }
20
+ # end
21
+ #
6
22
  class Spider < Base
7
- attr_accessor :enqueue
23
+ # @!attribute [rw] enqueue
24
+ # If true enqueue found links
25
+ #
26
+ # @!attribute [rw] link_rules
27
+ # same as Nokogiri::XML::DocumentFragment#search
28
+ # In general, this can be an XPath expression or a CSS selector.
29
+ #
30
+ # @!attribute [rw] next_page_link_rules
31
+ # same as Nokogiri::XML::DocumentFragment#search
32
+ # In general, this can be an XPath expression or a CSS selector.
33
+ #
34
+ attr_accessor :enqueue, :link_rules, :next_page_link_rules
35
+
36
+ # @!attribute [w] link_message
37
+ # Specify a hash literal to propagate arbitrary data to the next crawler/processor.
38
+ # This is for filtering messages before the crawler/processor processes them.
39
+ #
40
+ # @!attribute [w] next_page_link_message
41
+ # Specify a hash literal to propagate arbitrary data to the next crawler/processor.
42
+ # This is for filtering messages before the crawler/processor processes them.
43
+ attr_writer :link_message, :next_page_link_message
8
44
 
9
45
  def initialize
10
46
  super
@@ -12,8 +48,31 @@ module DaimonSkycrawlers
12
48
  @doc = nil
13
49
  @links = nil
14
50
  @enqueue = true
51
+ @link_rules = ["a"]
52
+ @extract_link = ->(element) { element["href"] }
53
+ @link_message = {}
54
+ @next_page_link_rules = nil
55
+ @extract_next_page_link = ->(element) { element["href"] }
56
+ @next_page_link_message = {}
15
57
  end
16
58
 
59
+ #
60
+ # Configure spider instance
61
+ #
62
+ # @return [DaimonSkycrawlers::Processor::Spider] self
63
+ #
64
+ def configure
65
+ yield self
66
+ self
67
+ end
68
+
69
+ #
70
+ # Append filter to reduce links found by link_rules
71
+ #
72
+ # @param filter [Object] Filter object that has call method
73
+ # @yield [message] Similar to Array#select
74
+ # @yieldparam message [Hash]
75
+ #
17
76
  def append_link_filter(filter = nil, &block)
18
77
  if block_given?
19
78
  @link_filters << block
@@ -22,6 +81,30 @@ module DaimonSkycrawlers
22
81
  end
23
82
  end
24
83
 
84
+ #
85
+ # Register block to process element found by DaimonSkycrawlers::Processor::Spider#link_rules
86
+ #
87
+ # @yield [element]
88
+ # @yieldparam element [Object]
89
+ # @example Default
90
+ # ->(element) { element["href"] }
91
+ #
92
+ def extract_link(&block)
93
+ @extract_link = block
94
+ end
95
+
96
+ #
97
+ # Register block to process element found by DaimonSkycrawlers::Processor::Spider#next_page_link_rules
98
+ #
99
+ # @yield [element]
100
+ # @yieldparam element [Object]
101
+ # @example Default
102
+ # ->(element) { element["href"] }
103
+ #
104
+ def extract_next_page_link(&block)
105
+ @extract_next_page_link = block
106
+ end
107
+
25
108
  #
26
109
  # @param [Hash] message Must have key :url, :depth
27
110
  #
@@ -35,8 +118,14 @@ module DaimonSkycrawlers
35
118
  new_message = {
36
119
  depth: depth - 1,
37
120
  }
121
+ link_message = new_message.merge(@link_message)
38
122
  links.each do |url|
39
- enqueue_url(url, new_message)
123
+ enqueue_url(url, link_message)
124
+ end
125
+ next_page_url = find_next_page_link
126
+ if next_page_link
127
+ next_page_link_message = new_message.merge(@next_page_link_message)
128
+ enqueue_url(next_page_url, next_page_link_message)
40
129
  end
41
130
  end
42
131
 
@@ -49,13 +138,20 @@ module DaimonSkycrawlers
49
138
  end
50
139
 
51
140
  def retrieve_links
52
- urls = @doc.search("a").map do |element|
53
- element["href"]
141
+ urls = @doc.search(*link_rules).map do |element|
142
+ @extract_next_page_link.call(element)
54
143
  end
55
144
  urls.uniq!
56
145
  apply_link_filters(urls) || []
57
146
  end
58
147
 
148
+ def next_page_link
149
+ return unless next_page_link_rules
150
+ element = @doc.at(*next_page_link_rules)
151
+ return unless element
152
+ @extract_next_page_link.call(element)
153
+ end
154
+
59
155
  def apply_link_filters(urls)
60
156
  return if urls.nil?
61
157
  return if urls.empty?
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.9.0"
2
+ VERSION = "0.10.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-24 00:00:00.000000000 Z
11
+ date: 2016-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -407,7 +407,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
407
407
  version: '0'
408
408
  requirements: []
409
409
  rubyforge_project:
410
- rubygems_version: 2.6.4
410
+ rubygems_version: 2.5.1
411
411
  signing_key:
412
412
  specification_version: 4
413
413
  summary: This is a crawler framework.