daimon_skycrawlers 1.0.0.pre.rc2 → 1.0.0.pre.rc3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/daimon_skycrawlers/crawler/base.rb +2 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -1
- data/lib/daimon_skycrawlers/processor/base.rb +5 -0
- data/lib/daimon_skycrawlers/processor/default.rb +1 -2
- data/lib/daimon_skycrawlers/processor/spider.rb +8 -5
- data/lib/daimon_skycrawlers/storage/base.rb +2 -2
- data/lib/daimon_skycrawlers/storage/file.rb +3 -1
- data/lib/daimon_skycrawlers/storage/null.rb +1 -1
- data/lib/daimon_skycrawlers/storage/rdb.rb +3 -3
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -2
- data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e324bf361f8fca59312726836b0239decc958752
|
|
4
|
+
data.tar.gz: c6195b2168b0440a456ed2a3e1b4f18a75e65dca
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: aaff4577b84b6842daf5a308dac8c001618c7fb34f6794bed7434ec253d15dd21b0fd9fa14d2fac2535f33507131cf15f5c950edfca369f02c92f778e79ddd17
|
|
7
|
+
data.tar.gz: 73c947c3ee5c9e60a0370273ce35128e6cb18e8606a5d8d3efc6bad12bbd3d203aeb5cf594a6eccf0a0231f9e03d1a8c47d35f64e164e0c35720e6a358c715fc
|
|
@@ -111,6 +111,8 @@ module DaimonSkycrawlers
|
|
|
111
111
|
# 1. Download(fetch) data from given URL
|
|
112
112
|
# 1. Run post processes (store downloaded data to storage)
|
|
113
113
|
#
|
|
114
|
+
# @param message [Hash] parameters for crawler
|
|
115
|
+
#
|
|
114
116
|
def process(message, &block)
|
|
115
117
|
@skipped = false
|
|
116
118
|
@n_processed_urls += 1
|
|
@@ -23,7 +23,8 @@ module DaimonSkycrawlers
|
|
|
23
23
|
#
|
|
24
24
|
def call(message, connection: nil)
|
|
25
25
|
url = normalize_url(message[:url])
|
|
26
|
-
|
|
26
|
+
message[:url] = url
|
|
27
|
+
page = storage.read(message)
|
|
27
28
|
return true unless page
|
|
28
29
|
if connection
|
|
29
30
|
response = connection.head(url)
|
|
@@ -17,6 +17,11 @@ module DaimonSkycrawlers
|
|
|
17
17
|
include DaimonSkycrawlers::Callbacks
|
|
18
18
|
include DaimonSkycrawlers::Configurable
|
|
19
19
|
|
|
20
|
+
# @!attribute [w] storage
|
|
21
|
+
# Set storage to crawler instance.
|
|
22
|
+
# @return [void]
|
|
23
|
+
attr_writer :storage
|
|
24
|
+
|
|
20
25
|
def initialize
|
|
21
26
|
super
|
|
22
27
|
@skipped = false
|
|
@@ -11,8 +11,7 @@ module DaimonSkycrawlers
|
|
|
11
11
|
# Display page information
|
|
12
12
|
#
|
|
13
13
|
def call(message)
|
|
14
|
-
|
|
15
|
-
page = storage.read(url, message)
|
|
14
|
+
page = storage.read(message)
|
|
16
15
|
headers = JSON.parse(page.headers)
|
|
17
16
|
headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
|
|
18
17
|
dumped_message = <<LOG
|
|
@@ -99,10 +99,13 @@ module DaimonSkycrawlers
|
|
|
99
99
|
# @param message [Hash] Must have key :url, :depth
|
|
100
100
|
#
|
|
101
101
|
def call(message)
|
|
102
|
-
key_url = message[:url]
|
|
103
102
|
depth = Integer(message[:depth] || 2)
|
|
104
103
|
return if depth <= 1
|
|
105
|
-
page = storage.read(
|
|
104
|
+
page = storage.read(message)
|
|
105
|
+
unless page
|
|
106
|
+
log.warn("Could not read page: url=#{message[:url]}, key=#{message[:key]}")
|
|
107
|
+
return
|
|
108
|
+
end
|
|
106
109
|
@doc = Nokogiri::HTML(page.body)
|
|
107
110
|
new_message = {
|
|
108
111
|
depth: depth - 1,
|
|
@@ -111,8 +114,8 @@ module DaimonSkycrawlers
|
|
|
111
114
|
links.each do |url|
|
|
112
115
|
enqueue_url(url, link_message)
|
|
113
116
|
end
|
|
114
|
-
next_page_url =
|
|
115
|
-
if
|
|
117
|
+
next_page_url = next_page_link
|
|
118
|
+
if next_page_url
|
|
116
119
|
next_page_link_message = new_message.merge(@next_page_link_message)
|
|
117
120
|
enqueue_url(next_page_url, next_page_link_message)
|
|
118
121
|
end
|
|
@@ -128,7 +131,7 @@ module DaimonSkycrawlers
|
|
|
128
131
|
|
|
129
132
|
def retrieve_links
|
|
130
133
|
urls = @doc.search(*link_rules).map do |element|
|
|
131
|
-
@
|
|
134
|
+
@extract_link.call(element)
|
|
132
135
|
end
|
|
133
136
|
urls.uniq!
|
|
134
137
|
apply_link_filters(urls) || []
|
|
@@ -29,9 +29,9 @@ module DaimonSkycrawlers
|
|
|
29
29
|
#
|
|
30
30
|
# Override this method in subclass
|
|
31
31
|
#
|
|
32
|
-
# @param
|
|
32
|
+
# @param message [Hash] this hash can include `:url`, `:key` to find page
|
|
33
33
|
#
|
|
34
|
-
def read(
|
|
34
|
+
def read(message = {})
|
|
35
35
|
raise "Implement this in subclass"
|
|
36
36
|
end
|
|
37
37
|
end
|
|
@@ -39,9 +39,11 @@ module DaimonSkycrawlers
|
|
|
39
39
|
#
|
|
40
40
|
# Read data from files under base directory
|
|
41
41
|
#
|
|
42
|
+
# @param message [Hash] this hash can include `:url`, `:key` to find page
|
|
42
43
|
# @return [DaimonSkycrawlers::Storage::File::Page]
|
|
43
44
|
#
|
|
44
|
-
def read(
|
|
45
|
+
def read(message)
|
|
46
|
+
url = message[:url]
|
|
45
47
|
key = message[:key]
|
|
46
48
|
headers = JSON.parse(headers_path(url, key).read)
|
|
47
49
|
body = body_path(url, key).read
|
|
@@ -39,10 +39,10 @@ module DaimonSkycrawlers
|
|
|
39
39
|
#
|
|
40
40
|
# Fetch page identified by url
|
|
41
41
|
#
|
|
42
|
-
# @param
|
|
43
|
-
# @param message [Hash] this hash may include `:key` to find page
|
|
42
|
+
# @param message [Hash] this hash can include `:url`, `:key` to find page
|
|
44
43
|
#
|
|
45
|
-
def read(
|
|
44
|
+
def read(message = {})
|
|
45
|
+
url = message[:url]
|
|
46
46
|
key = message[:key]
|
|
47
47
|
if key
|
|
48
48
|
Page.where(key: key).order(updated_at: :desc).limit(1).first
|
|
@@ -6,8 +6,7 @@ require "daimon_skycrawlers/processor/spider"
|
|
|
6
6
|
class AmazonRanking < DaimonSkycrawlers::Processor::Base
|
|
7
7
|
Item = Struct.new(:rank, :name, :url, :star, :review)
|
|
8
8
|
def call(message)
|
|
9
|
-
|
|
10
|
-
page = storage.read(url)
|
|
9
|
+
page = storage.read(message)
|
|
11
10
|
doc = Nokogiri::HTML(page.body)
|
|
12
11
|
ranking = []
|
|
13
12
|
doc.search(".zg_itemRow").each do |item|
|
|
@@ -7,7 +7,7 @@ require_relative "../models/itp_shop"
|
|
|
7
7
|
class ItpProcessor < DaimonSkycrawlers::Processor::Base
|
|
8
8
|
def call(message)
|
|
9
9
|
key_url = message[:url]
|
|
10
|
-
page = storage.read(
|
|
10
|
+
page = storage.read(message)
|
|
11
11
|
@doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
|
|
12
12
|
ItpShop.transaction do
|
|
13
13
|
prepare_shops do |shop|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: daimon_skycrawlers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.0.pre.
|
|
4
|
+
version: 1.0.0.pre.rc3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- daimon developers
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2017-02-
|
|
11
|
+
date: 2017-02-09 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -462,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
462
462
|
version: 1.3.1
|
|
463
463
|
requirements: []
|
|
464
464
|
rubyforge_project:
|
|
465
|
-
rubygems_version: 2.6.
|
|
465
|
+
rubygems_version: 2.6.4
|
|
466
466
|
signing_key:
|
|
467
467
|
specification_version: 4
|
|
468
468
|
summary: This is a crawler framework.
|