daimon_skycrawlers 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/daimon_skycrawlers/processor/spider.rb +100 -4
- data/lib/daimon_skycrawlers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 58dd7f91f6e9da8f9388a3364731ab0a543c01cd
|
|
4
|
+
data.tar.gz: b815a1cdad154eaf1b828568a697887df02bfcb6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1ffbdd022a6e6a3a80d292bae8927a716f33f339e19e12f67da536812f9118615be0d8886fc143b529b7854f3b4ef0731962f3102aa6c759790d86dd2df54914
|
|
7
|
+
data.tar.gz: 1beb4753e8602224c95081651a4da6276659bfe132cdb9dc5fe55e8ae9f2c524681bd9be8137d4e29d52380d0f137ea2630097337bc07aea31de55e3f2785d3c
|
|
@@ -3,8 +3,44 @@ require "daimon_skycrawlers/crawler"
|
|
|
3
3
|
|
|
4
4
|
module DaimonSkycrawlers
|
|
5
5
|
module Processor
|
|
6
|
+
#
|
|
7
|
+
# Web spider class.
|
|
8
|
+
# By default, extracts all links and follows them.
|
|
9
|
+
#
|
|
10
|
+
# @example Google search result (2016-11-29)
|
|
11
|
+
# spider = DaimonSkycrawlers::Processor::Spider.new
|
|
12
|
+
# spider.configure do |s|
|
|
13
|
+
# s.link_rules = ".g .r a"
|
|
14
|
+
# s.extract_link do |element|
|
|
15
|
+
# element["data-href"]
|
|
16
|
+
# end
|
|
17
|
+
# s.link_message = { next: "detail" }
|
|
18
|
+
# s.next_page_link_rules = "a#pnnext"
|
|
19
|
+
# s.next_page_link_message = { next: "spider" }
|
|
20
|
+
# end
|
|
21
|
+
#
|
|
6
22
|
class Spider < Base
|
|
7
|
-
|
|
23
|
+
# @!attribute [rw] enqueue
|
|
24
|
+
# If true, enqueue the links that are found.
|
|
25
|
+
#
|
|
26
|
+
# @!attribute [rw] link_rules
|
|
27
|
+
# same as Nokogiri::XML::DocumentFragment#search
|
|
28
|
+
# In general, this can be an XPath expression or a CSS selector.
|
|
29
|
+
#
|
|
30
|
+
# @!attribute [rw] next_page_link_rules
|
|
31
|
+
# same as Nokogiri::XML::DocumentFragment#search
|
|
32
|
+
# In general, this can be an XPath expression or a CSS selector.
|
|
33
|
+
#
|
|
34
|
+
attr_accessor :enqueue, :link_rules, :next_page_link_rules
|
|
35
|
+
|
|
36
|
+
# @!attribute [w] link_message
|
|
37
|
+
# Specify a hash to propagate arbitrary data to the next crawler/processor.
|
|
38
|
+
# This is for filtering messages before the crawler/processor processes them.
|
|
39
|
+
#
|
|
40
|
+
# @!attribute [w] next_page_link_message
|
|
41
|
+
# Specify a hash to propagate arbitrary data to the next crawler/processor.
|
|
42
|
+
# This is for filtering messages before the crawler/processor processes them.
|
|
43
|
+
attr_writer :link_message, :next_page_link_message
|
|
8
44
|
|
|
9
45
|
def initialize
|
|
10
46
|
super
|
|
@@ -12,8 +48,31 @@ module DaimonSkycrawlers
|
|
|
12
48
|
@doc = nil
|
|
13
49
|
@links = nil
|
|
14
50
|
@enqueue = true
|
|
51
|
+
@link_rules = ["a"]
|
|
52
|
+
@extract_link = ->(element) { element["href"] }
|
|
53
|
+
@link_message = {}
|
|
54
|
+
@next_page_link_rules = nil
|
|
55
|
+
@extract_next_page_link = ->(element) { element["href"] }
|
|
56
|
+
@next_page_link_message = {}
|
|
15
57
|
end
|
|
16
58
|
|
|
59
|
+
#
|
|
60
|
+
# Configure spider instance
|
|
61
|
+
#
|
|
62
|
+
# @return [DaimonSkycrawlers::Processor::Spider] self
|
|
63
|
+
#
|
|
64
|
+
def configure
|
|
65
|
+
yield self
|
|
66
|
+
self
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
#
|
|
70
|
+
# Append filter to reduce links found by link_rules
|
|
71
|
+
#
|
|
72
|
+
# @param filter [Object] Filter object that has call method
|
|
73
|
+
# @yield [message] Similar to Array#select
|
|
74
|
+
# @yieldparam message [Hash]
|
|
75
|
+
#
|
|
17
76
|
def append_link_filter(filter = nil, &block)
|
|
18
77
|
if block_given?
|
|
19
78
|
@link_filters << block
|
|
@@ -22,6 +81,30 @@ module DaimonSkycrawlers
|
|
|
22
81
|
end
|
|
23
82
|
end
|
|
24
83
|
|
|
84
|
+
#
|
|
85
|
+
# Register block to process element found by DaimonSkycrawlers::Processor::Spider#link_rules
|
|
86
|
+
#
|
|
87
|
+
# @yield [element]
|
|
88
|
+
# @yieldparam element [Object]
|
|
89
|
+
# @example Default
|
|
90
|
+
# ->(element) { element["href"] }
|
|
91
|
+
#
|
|
92
|
+
def extract_link(&block)
|
|
93
|
+
@extract_link = block
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
#
|
|
97
|
+
# Register block to process element found by DaimonSkycrawlers::Processor::Spider#next_page_link_rules
|
|
98
|
+
#
|
|
99
|
+
# @yield [element]
|
|
100
|
+
# @yieldparam element [Object]
|
|
101
|
+
# @example Default
|
|
102
|
+
# ->(element) { element["href"] }
|
|
103
|
+
#
|
|
104
|
+
def extract_next_page_link(&block)
|
|
105
|
+
@extract_next_page_link = block
|
|
106
|
+
end
|
|
107
|
+
|
|
25
108
|
#
|
|
26
109
|
# @param [Hash] message Must have key :url, :depth
|
|
27
110
|
#
|
|
@@ -35,8 +118,14 @@ module DaimonSkycrawlers
|
|
|
35
118
|
new_message = {
|
|
36
119
|
depth: depth - 1,
|
|
37
120
|
}
|
|
121
|
+
link_message = new_message.merge(@link_message)
|
|
38
122
|
links.each do |url|
|
|
39
|
-
enqueue_url(url,
|
|
123
|
+
enqueue_url(url, link_message)
|
|
124
|
+
end
|
|
125
|
+
next_page_url = next_page_link # the helper defined in this class is #next_page_link
|
|
126
|
+
if next_page_link
|
|
127
|
+
next_page_link_message = new_message.merge(@next_page_link_message)
|
|
128
|
+
enqueue_url(next_page_url, next_page_link_message)
|
|
40
129
|
end
|
|
41
130
|
end
|
|
42
131
|
|
|
@@ -49,13 +138,20 @@ module DaimonSkycrawlers
|
|
|
49
138
|
end
|
|
50
139
|
|
|
51
140
|
def retrieve_links
|
|
52
|
-
urls = @doc.search(
|
|
53
|
-
element
|
|
141
|
+
urls = @doc.search(*link_rules).map do |element|
|
|
142
|
+
@extract_link.call(element) # regular links use the extractor registered via #extract_link, not the next-page one
|
|
54
143
|
end
|
|
55
144
|
urls.uniq!
|
|
56
145
|
apply_link_filters(urls) || []
|
|
57
146
|
end
|
|
58
147
|
|
|
148
|
+
def next_page_link
|
|
149
|
+
return unless next_page_link_rules
|
|
150
|
+
element = @doc.at(*next_page_link_rules)
|
|
151
|
+
return unless element
|
|
152
|
+
@extract_next_page_link.call(element)
|
|
153
|
+
end
|
|
154
|
+
|
|
59
155
|
def apply_link_filters(urls)
|
|
60
156
|
return if urls.nil?
|
|
61
157
|
return if urls.empty?
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: daimon_skycrawlers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.9.0
|
|
4
|
+
version: 0.10.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- daimon developers
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-11-
|
|
11
|
+
date: 2016-11-29 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -407,7 +407,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
407
407
|
version: '0'
|
|
408
408
|
requirements: []
|
|
409
409
|
rubyforge_project:
|
|
410
|
-
rubygems_version: 2.
|
|
410
|
+
rubygems_version: 2.5.1
|
|
411
411
|
signing_key:
|
|
412
412
|
specification_version: 4
|
|
413
413
|
summary: This is a crawler framework.
|