daimon_skycrawlers 1.0.0.pre.rc2 → 1.0.0.pre.rc3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/daimon_skycrawlers/crawler/base.rb +2 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -1
- data/lib/daimon_skycrawlers/processor/base.rb +5 -0
- data/lib/daimon_skycrawlers/processor/default.rb +1 -2
- data/lib/daimon_skycrawlers/processor/spider.rb +8 -5
- data/lib/daimon_skycrawlers/storage/base.rb +2 -2
- data/lib/daimon_skycrawlers/storage/file.rb +3 -1
- data/lib/daimon_skycrawlers/storage/null.rb +1 -1
- data/lib/daimon_skycrawlers/storage/rdb.rb +3 -3
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -2
- data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e324bf361f8fca59312726836b0239decc958752
|
4
|
+
data.tar.gz: c6195b2168b0440a456ed2a3e1b4f18a75e65dca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aaff4577b84b6842daf5a308dac8c001618c7fb34f6794bed7434ec253d15dd21b0fd9fa14d2fac2535f33507131cf15f5c950edfca369f02c92f778e79ddd17
|
7
|
+
data.tar.gz: 73c947c3ee5c9e60a0370273ce35128e6cb18e8606a5d8d3efc6bad12bbd3d203aeb5cf594a6eccf0a0231f9e03d1a8c47d35f64e164e0c35720e6a358c715fc
|
data/lib/daimon_skycrawlers/crawler/base.rb
CHANGED
@@ -111,6 +111,8 @@ module DaimonSkycrawlers
|
|
111
111
|
# 1. Download(fetch) data from given URL
|
112
112
|
# 1. Run post processes (store downloaded data to storage)
|
113
113
|
#
|
114
|
+
# @param message [Hash] parameters for crawler
|
115
|
+
#
|
114
116
|
def process(message, &block)
|
115
117
|
@skipped = false
|
116
118
|
@n_processed_urls += 1
|
data/lib/daimon_skycrawlers/filter/update_checker.rb
CHANGED
@@ -23,7 +23,8 @@ module DaimonSkycrawlers
|
|
23
23
|
#
|
24
24
|
def call(message, connection: nil)
|
25
25
|
url = normalize_url(message[:url])
|
26
|
-
|
26
|
+
message[:url] = url
|
27
|
+
page = storage.read(message)
|
27
28
|
return true unless page
|
28
29
|
if connection
|
29
30
|
response = connection.head(url)
|
data/lib/daimon_skycrawlers/processor/base.rb
CHANGED
@@ -17,6 +17,11 @@ module DaimonSkycrawlers
|
|
17
17
|
include DaimonSkycrawlers::Callbacks
|
18
18
|
include DaimonSkycrawlers::Configurable
|
19
19
|
|
20
|
+
# @!attribute [w] storage
|
21
|
+
# Set storage to crawler instance.
|
22
|
+
# @return [void]
|
23
|
+
attr_writer :storage
|
24
|
+
|
20
25
|
def initialize
|
21
26
|
super
|
22
27
|
@skipped = false
|
data/lib/daimon_skycrawlers/processor/default.rb
CHANGED
@@ -11,8 +11,7 @@ module DaimonSkycrawlers
|
|
11
11
|
# Display page information
|
12
12
|
#
|
13
13
|
def call(message)
|
14
|
-
|
15
|
-
page = storage.read(url, message)
|
14
|
+
page = storage.read(message)
|
16
15
|
headers = JSON.parse(page.headers)
|
17
16
|
headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
|
18
17
|
dumped_message = <<LOG
|
data/lib/daimon_skycrawlers/processor/spider.rb
CHANGED
@@ -99,10 +99,13 @@ module DaimonSkycrawlers
|
|
99
99
|
# @param message [Hash] Must have key :url, :depth
|
100
100
|
#
|
101
101
|
def call(message)
|
102
|
-
key_url = message[:url]
|
103
102
|
depth = Integer(message[:depth] || 2)
|
104
103
|
return if depth <= 1
|
105
|
-
page = storage.read(
|
104
|
+
page = storage.read(message)
|
105
|
+
unless page
|
106
|
+
log.warn("Could not read page: url=#{message[:url]}, key=#{message[:key]}")
|
107
|
+
return
|
108
|
+
end
|
106
109
|
@doc = Nokogiri::HTML(page.body)
|
107
110
|
new_message = {
|
108
111
|
depth: depth - 1,
|
@@ -111,8 +114,8 @@ module DaimonSkycrawlers
|
|
111
114
|
links.each do |url|
|
112
115
|
enqueue_url(url, link_message)
|
113
116
|
end
|
114
|
-
next_page_url =
|
115
|
-
if
|
117
|
+
next_page_url = next_page_link
|
118
|
+
if next_page_url
|
116
119
|
next_page_link_message = new_message.merge(@next_page_link_message)
|
117
120
|
enqueue_url(next_page_url, next_page_link_message)
|
118
121
|
end
|
@@ -128,7 +131,7 @@ module DaimonSkycrawlers
|
|
128
131
|
|
129
132
|
def retrieve_links
|
130
133
|
urls = @doc.search(*link_rules).map do |element|
|
131
|
-
@
|
134
|
+
@extract_link.call(element)
|
132
135
|
end
|
133
136
|
urls.uniq!
|
134
137
|
apply_link_filters(urls) || []
|
data/lib/daimon_skycrawlers/storage/base.rb
CHANGED
@@ -29,9 +29,9 @@ module DaimonSkycrawlers
|
|
29
29
|
#
|
30
30
|
# Override this method in subclass
|
31
31
|
#
|
32
|
-
# @param
|
32
|
+
# @param message [Hash] this hash can include `:url`, `:key` to find page
|
33
33
|
#
|
34
|
-
def read(
|
34
|
+
def read(message = {})
|
35
35
|
raise "Implement this in subclass"
|
36
36
|
end
|
37
37
|
end
|
data/lib/daimon_skycrawlers/storage/file.rb
CHANGED
@@ -39,9 +39,11 @@ module DaimonSkycrawlers
|
|
39
39
|
#
|
40
40
|
# Read data from files under base directory
|
41
41
|
#
|
42
|
+
# @param message [Hash] this hash can include `:url`, `:key` to find page
|
42
43
|
# @return [DaimonSkycrawlers::Storage::File::Page]
|
43
44
|
#
|
44
|
-
def read(
|
45
|
+
def read(message)
|
46
|
+
url = message[:url]
|
45
47
|
key = message[:key]
|
46
48
|
headers = JSON.parse(headers_path(url, key).read)
|
47
49
|
body = body_path(url, key).read
|
data/lib/daimon_skycrawlers/storage/rdb.rb
CHANGED
@@ -39,10 +39,10 @@ module DaimonSkycrawlers
|
|
39
39
|
#
|
40
40
|
# Fetch page identified by url
|
41
41
|
#
|
42
|
-
# @param
|
43
|
-
# @param message [Hash] this hash may include `:key` to find page
|
42
|
+
# @param message [Hash] this hash can include `:url`, `:key` to find page
|
44
43
|
#
|
45
|
-
def read(
|
44
|
+
def read(message = {})
|
45
|
+
url = message[:url]
|
46
46
|
key = message[:key]
|
47
47
|
if key
|
48
48
|
Page.where(key: key).order(updated_at: :desc).limit(1).first
|
data/sample/amazon-ranking/app/processors/amazon_ranking.rb
CHANGED
@@ -6,8 +6,7 @@ require "daimon_skycrawlers/processor/spider"
|
|
6
6
|
class AmazonRanking < DaimonSkycrawlers::Processor::Base
|
7
7
|
Item = Struct.new(:rank, :name, :url, :star, :review)
|
8
8
|
def call(message)
|
9
|
-
|
10
|
-
page = storage.read(url)
|
9
|
+
page = storage.read(message)
|
11
10
|
doc = Nokogiri::HTML(page.body)
|
12
11
|
ranking = []
|
13
12
|
doc.search(".zg_itemRow").each do |item|
|
data/sample/itp-crawler/app/processors/itp_processor.rb
CHANGED
@@ -7,7 +7,7 @@ require_relative "../models/itp_shop"
|
|
7
7
|
class ItpProcessor < DaimonSkycrawlers::Processor::Base
|
8
8
|
def call(message)
|
9
9
|
key_url = message[:url]
|
10
|
-
page = storage.read(
|
10
|
+
page = storage.read(message)
|
11
11
|
@doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
|
12
12
|
ItpShop.transaction do
|
13
13
|
prepare_shops do |shop|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.pre.rc2
|
4
|
+
version: 1.0.0.pre.rc3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- daimon developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -462,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
462
462
|
version: 1.3.1
|
463
463
|
requirements: []
|
464
464
|
rubyforge_project:
|
465
|
-
rubygems_version: 2.6.
|
465
|
+
rubygems_version: 2.6.4
|
466
466
|
signing_key:
|
467
467
|
specification_version: 4
|
468
468
|
summary: This is a crawler framework.
|