daimon_skycrawlers 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2fd637d109812fe657a771536f3c3a0041e4868a
4
- data.tar.gz: 031bf50b2b72e6320ee748cb9044e177f24f5d2f
3
+ metadata.gz: 58dd7f91f6e9da8f9388a3364731ab0a543c01cd
4
+ data.tar.gz: b815a1cdad154eaf1b828568a697887df02bfcb6
5
5
  SHA512:
6
- metadata.gz: 59f6404ea231ecc337b1658406daa8c1e3b0d557ddc3d46c37efa91b1f7d3e0f8018d524aa3c2cc227fbee57d04ef51d7561ae879fd5d0fc719f4fca1308afb3
7
- data.tar.gz: eeb06f74ae722a6cdf18d00f0aeda3e06e06199e02ddd2f0e52fedb3c7a7028c4a99e46a8c19d2a4e6aa0c3979f0602757d5598a5e1de39112457d1b5dce5679
6
+ metadata.gz: 1ffbdd022a6e6a3a80d292bae8927a716f33f339e19e12f67da536812f9118615be0d8886fc143b529b7854f3b4ef0731962f3102aa6c759790d86dd2df54914
7
+ data.tar.gz: 1beb4753e8602224c95081651a4da6276659bfe132cdb9dc5fe55e8ae9f2c524681bd9be8137d4e29d52380d0f137ea2630097337bc07aea31de55e3f2785d3c
@@ -3,8 +3,44 @@ require "daimon_skycrawlers/crawler"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Processor
6
+ #
7
+ # Web spider class.
8
+ # By default, extracts all links and follows them.
9
+ #
10
+ # @example Google search result (2016-11-29)
11
+ # spider = DaimonSkycrawlers::Processor::Spider.new
12
+ # spider.configure do |s|
13
+ # s.link_rules = ".g .r a"
14
+ # s.extract_link do |element|
15
+ # element["data-href"]
16
+ # end
17
+ # s.link_message = { next: "detail" }
18
+ # s.next_page_link_rules = "a#pnnext"
19
+ # s.next_page_link_message = { next: "spider" }
20
+ # end
21
+ #
6
22
  class Spider < Base
7
- attr_accessor :enqueue
23
+ # @!attribute [rw] enqueue
24
+ # If true enqueue found links
25
+ #
26
+ # @!attribute [rw] link_rules
27
+ # same as Nokogiri::XML::DocumentFragment#search
28
+ # In general, we can set an XPath or CSS selector.
29
+ #
30
+ # @!attribute [rw] next_page_link_rules
31
+ # same as Nokogiri::XML::DocumentFragment#search
32
+ # In general, we can set an XPath or CSS selector.
33
+ #
34
+ attr_accessor :enqueue, :link_rules, :next_page_link_rules
35
+
36
+ # @!attribute [w] link_message
37
+ # Specify a hash literal to propagate arbitrary data to the next crawler/processor.
38
+ # This is for filtering messages before the crawler/processor processes them.
39
+ #
40
+ # @!attribute [w] next_page_link_message
41
+ # Specify a hash literal to propagate arbitrary data to the next crawler/processor.
42
+ # This is for filtering messages before the crawler/processor processes them.
43
+ attr_writer :link_message, :next_page_link_message
8
44
 
9
45
  def initialize
10
46
  super
@@ -12,8 +48,31 @@ module DaimonSkycrawlers
12
48
  @doc = nil
13
49
  @links = nil
14
50
  @enqueue = true
51
+ @link_rules = ["a"]
52
+ @extract_link = ->(element) { element["href"] }
53
+ @link_message = {}
54
+ @next_page_link_rules = nil
55
+ @extract_next_page_link = ->(element) { element["href"] }
56
+ @next_page_link_message = {}
15
57
  end
16
58
 
59
+ #
60
+ # Configure spider instance
61
+ #
62
+ # @return [DaimonSkycrawlers::Processor::Spider] self
63
+ #
64
+ def configure
65
+ yield self
66
+ self
67
+ end
68
+
69
+ #
70
+ # Append filter to reduce links found by link_rules
71
+ #
72
+ # @param filter [Object] Filter object that has call method
73
+ # @yield [message] Similar to Array#select
74
+ # @yieldparam message [Hash]
75
+ #
17
76
  def append_link_filter(filter = nil, &block)
18
77
  if block_given?
19
78
  @link_filters << block
@@ -22,6 +81,30 @@ module DaimonSkycrawlers
22
81
  end
23
82
  end
24
83
 
84
+ #
85
+ # Register block to process element found by DaimonSkycrawlers::Processor::Spider#link_rules
86
+ #
87
+ # @yield [element]
88
+ # @yieldparam element [Object]
89
+ # @example Default
90
+ # ->(element) { element["href"] }
91
+ #
92
+ def extract_link(&block)
93
+ @extract_link = block
94
+ end
95
+
96
+ #
97
+ # Register block to process element found by DaimonSkycrawlers::Processor::Spider#next_page_link_rules
98
+ #
99
+ # @yield [element]
100
+ # @yieldparam element [Object]
101
+ # @example Default
102
+ # ->(element) { element["href"] }
103
+ #
104
+ def extract_next_page_link(&block)
105
+ @extract_next_page_link = block
106
+ end
107
+
25
108
  #
26
109
  # @param [Hash] message Must have key :url, :depth
27
110
  #
@@ -35,8 +118,14 @@ module DaimonSkycrawlers
35
118
  new_message = {
36
119
  depth: depth - 1,
37
120
  }
121
+ link_message = new_message.merge(@link_message)
38
122
  links.each do |url|
39
- enqueue_url(url, new_message)
123
+ enqueue_url(url, link_message)
124
+ end
125
+ next_page_url = find_next_page_link
126
+ if next_page_link
127
+ next_page_link_message = new_message.merge(@next_page_link_message)
128
+ enqueue_url(next_page_url, next_page_link_message)
40
129
  end
41
130
  end
42
131
 
@@ -49,13 +138,20 @@ module DaimonSkycrawlers
49
138
  end
50
139
 
51
140
  def retrieve_links
52
- urls = @doc.search("a").map do |element|
53
- element["href"]
141
+ urls = @doc.search(*link_rules).map do |element|
142
+ @extract_next_page_link.call(element)
54
143
  end
55
144
  urls.uniq!
56
145
  apply_link_filters(urls) || []
57
146
  end
58
147
 
148
+ def next_page_link
149
+ return unless next_page_link_rules
150
+ element = @doc.at(*next_page_link_rules)
151
+ return unless element
152
+ @extract_next_page_link.call(element)
153
+ end
154
+
59
155
  def apply_link_filters(urls)
60
156
  return if urls.nil?
61
157
  return if urls.empty?
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.9.0"
2
+ VERSION = "0.10.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-24 00:00:00.000000000 Z
11
+ date: 2016-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -407,7 +407,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
407
407
  version: '0'
408
408
  requirements: []
409
409
  rubyforge_project:
410
- rubygems_version: 2.6.4
410
+ rubygems_version: 2.5.1
411
411
  signing_key:
412
412
  specification_version: 4
413
413
  summary: This is a crawler framework.