daimon_skycrawlers 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/daimon_skycrawlers/processor/spider.rb +100 -4
- data/lib/daimon_skycrawlers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58dd7f91f6e9da8f9388a3364731ab0a543c01cd
|
4
|
+
data.tar.gz: b815a1cdad154eaf1b828568a697887df02bfcb6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ffbdd022a6e6a3a80d292bae8927a716f33f339e19e12f67da536812f9118615be0d8886fc143b529b7854f3b4ef0731962f3102aa6c759790d86dd2df54914
|
7
|
+
data.tar.gz: 1beb4753e8602224c95081651a4da6276659bfe132cdb9dc5fe55e8ae9f2c524681bd9be8137d4e29d52380d0f137ea2630097337bc07aea31de55e3f2785d3c
|
@@ -3,8 +3,44 @@ require "daimon_skycrawlers/crawler"
|
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
5
|
module Processor
|
6
|
+
#
|
7
|
+
# Web spider class.
|
8
|
+
# By default, extracts all links and follows them.
|
9
|
+
#
|
10
|
+
# @example Google search result (2016-11-29)
|
11
|
+
# spider = DaimonSkycrawlers::Processor::Spider.new
|
12
|
+
# spider.configure do |s|
|
13
|
+
# s.link_rules = ".g .r a"
|
14
|
+
# s.extract_link do |element|
|
15
|
+
# element["data-href"]
|
16
|
+
# end
|
17
|
+
# s.link_message = { next: "detail" }
|
18
|
+
# s.next_page_link_rules = "a#pnnext"
|
19
|
+
# s.next_page_link_message = { next: "spider" }
|
20
|
+
# end
|
21
|
+
#
|
6
22
|
class Spider < Base
|
7
|
-
|
23
|
+
# @!attribute [rw] enqueue
|
24
|
+
# If true enqueue found links
|
25
|
+
#
|
26
|
+
# @!attribute [rw] link_rules
|
27
|
+
# same as Nokogiri::XML::DocumentFragment#search
|
28
|
+
# In general, this can be an XPath expression or a CSS selector.
|
29
|
+
#
|
30
|
+
# @!attribute [rw] next_page_link_rules
|
31
|
+
# same as Nokogiri::XML::DocumentFragment#search
|
32
|
+
# In general, this can be an XPath expression or a CSS selector.
|
33
|
+
#
|
34
|
+
attr_accessor :enqueue, :link_rules, :next_page_link_rules
|
35
|
+
|
36
|
+
# @!attribute [w] link_message
|
37
|
+
# Specify a hash literal to propagate arbitrary data to the next crawler/processor.
|
38
|
+
# This is for filtering the message before the crawler/processor processes it.
|
39
|
+
#
|
40
|
+
# @!attribute [w] next_page_link_message
|
41
|
+
# Specify a hash literal to propagate arbitrary data to the next crawler/processor.
|
42
|
+
# This is for filtering the message before the crawler/processor processes it.
|
43
|
+
attr_writer :link_message, :next_page_link_message
|
8
44
|
|
9
45
|
def initialize
|
10
46
|
super
|
@@ -12,8 +48,31 @@ module DaimonSkycrawlers
|
|
12
48
|
@doc = nil
|
13
49
|
@links = nil
|
14
50
|
@enqueue = true
|
51
|
+
@link_rules = ["a"]
|
52
|
+
@extract_link = ->(element) { element["href"] }
|
53
|
+
@link_message = {}
|
54
|
+
@next_page_link_rules = nil
|
55
|
+
@extract_next_page_link = ->(element) { element["href"] }
|
56
|
+
@next_page_link_message = {}
|
15
57
|
end
|
16
58
|
|
59
|
+
#
|
60
|
+
# Configure spider instance
|
61
|
+
#
|
62
|
+
# @return [DaimonSkycrawlers::Processor::Spider] self
|
63
|
+
#
|
64
|
+
def configure
|
65
|
+
yield self
|
66
|
+
self
|
67
|
+
end
|
68
|
+
|
69
|
+
#
|
70
|
+
# Append filter to reduce links found by link_rules
|
71
|
+
#
|
72
|
+
# @param filter [Object] Filter object that has call method
|
73
|
+
# @yield [message] Similar to Array#select
|
74
|
+
# @yieldparam message [Hash]
|
75
|
+
#
|
17
76
|
def append_link_filter(filter = nil, &block)
|
18
77
|
if block_given?
|
19
78
|
@link_filters << block
|
@@ -22,6 +81,30 @@ module DaimonSkycrawlers
|
|
22
81
|
end
|
23
82
|
end
|
24
83
|
|
84
|
+
#
|
85
|
+
# Register block to process element found by DaimonSkycrawlers::Processor::Spider#link_rules
|
86
|
+
#
|
87
|
+
# @yield [element]
|
88
|
+
# @yieldparam element [Object]
|
89
|
+
# @example Default
|
90
|
+
# ->(element) { element["href"] }
|
91
|
+
#
|
92
|
+
def extract_link(&block)
|
93
|
+
@extract_link = block
|
94
|
+
end
|
95
|
+
|
96
|
+
#
|
97
|
+
# Register block to process element found by DaimonSkycrawlers::Processor::Spider#next_page_link_rules
|
98
|
+
#
|
99
|
+
# @yield [element]
|
100
|
+
# @yieldparam element [Object]
|
101
|
+
# @example Default
|
102
|
+
# ->(element) { element["href"] }
|
103
|
+
#
|
104
|
+
def extract_next_page_link(&block)
|
105
|
+
@extract_next_page_link = block
|
106
|
+
end
|
107
|
+
|
25
108
|
#
|
26
109
|
# @param [Hash] message Must have key :url, :depth
|
27
110
|
#
|
@@ -35,8 +118,14 @@ module DaimonSkycrawlers
|
|
35
118
|
new_message = {
|
36
119
|
depth: depth - 1,
|
37
120
|
}
|
121
|
+
link_message = new_message.merge(@link_message)
|
38
122
|
links.each do |url|
|
39
|
-
enqueue_url(url,
|
123
|
+
enqueue_url(url, link_message)
|
124
|
+
end
|
125
|
+
next_page_url = find_next_page_link
|
126
|
+
if next_page_link
|
127
|
+
next_page_link_message = new_message.merge(@next_page_link_message)
|
128
|
+
enqueue_url(next_page_url, next_page_link_message)
|
40
129
|
end
|
41
130
|
end
|
42
131
|
|
@@ -49,13 +138,20 @@ module DaimonSkycrawlers
|
|
49
138
|
end
|
50
139
|
|
51
140
|
def retrieve_links
|
52
|
-
urls = @doc.search(
|
53
|
-
element
|
141
|
+
urls = @doc.search(*link_rules).map do |element|
|
142
|
+
@extract_next_page_link.call(element)
|
54
143
|
end
|
55
144
|
urls.uniq!
|
56
145
|
apply_link_filters(urls) || []
|
57
146
|
end
|
58
147
|
|
148
|
+
def next_page_link
|
149
|
+
return unless next_page_link_rules
|
150
|
+
element = @doc.at(*next_page_link_rules)
|
151
|
+
return unless element
|
152
|
+
@extract_next_page_link.call(element)
|
153
|
+
end
|
154
|
+
|
59
155
|
def apply_link_filters(urls)
|
60
156
|
return if urls.nil?
|
61
157
|
return if urls.empty?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- daimon developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -407,7 +407,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
407
407
|
version: '0'
|
408
408
|
requirements: []
|
409
409
|
rubyforge_project:
|
410
|
-
rubygems_version: 2.
|
410
|
+
rubygems_version: 2.5.1
|
411
411
|
signing_key:
|
412
412
|
specification_version: 4
|
413
413
|
summary: This is a crawler framework.
|