daimon_skycrawlers 1.0.0.pre.rc2 → 1.0.0.pre.rc3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 648bff4fe2019f82836bcf21c75dc6e29d45abc8
4
- data.tar.gz: f7691acca87a0686190171806a6a07aaa8f4015f
3
+ metadata.gz: e324bf361f8fca59312726836b0239decc958752
4
+ data.tar.gz: c6195b2168b0440a456ed2a3e1b4f18a75e65dca
5
5
  SHA512:
6
- metadata.gz: 378fe0060bacead511a87f637702cd5d24ce79c5acf3cba0bfac123d9dfc2cfcde41261a813dce69f9e11126d988df609a92df126f82235610db7c5a7332539b
7
- data.tar.gz: '0790c05562fc71542a1cf33ca0971bbf356659d20012f7968c6c4ffd9ec9df521f0e6d009aa8256ec05a7d1b3e494baaec187d1faa8bbd34102fb73875edcfd0'
6
+ metadata.gz: aaff4577b84b6842daf5a308dac8c001618c7fb34f6794bed7434ec253d15dd21b0fd9fa14d2fac2535f33507131cf15f5c950edfca369f02c92f778e79ddd17
7
+ data.tar.gz: 73c947c3ee5c9e60a0370273ce35128e6cb18e8606a5d8d3efc6bad12bbd3d203aeb5cf594a6eccf0a0231f9e03d1a8c47d35f64e164e0c35720e6a358c715fc
@@ -111,6 +111,8 @@ module DaimonSkycrawlers
111
111
  # 1. Download(fetch) data from given URL
112
112
  # 1. Run post processes (store downloaded data to storage)
113
113
  #
114
+ # @param message [Hash] parameters for crawler
115
+ #
114
116
  def process(message, &block)
115
117
  @skipped = false
116
118
  @n_processed_urls += 1
@@ -23,7 +23,8 @@ module DaimonSkycrawlers
23
23
  #
24
24
  def call(message, connection: nil)
25
25
  url = normalize_url(message[:url])
26
- page = storage.read(url, message)
26
+ message[:url] = url
27
+ page = storage.read(message)
27
28
  return true unless page
28
29
  if connection
29
30
  response = connection.head(url)
@@ -17,6 +17,11 @@ module DaimonSkycrawlers
17
17
  include DaimonSkycrawlers::Callbacks
18
18
  include DaimonSkycrawlers::Configurable
19
19
 
20
+ # @!attribute [w] storage
21
+ # Set storage to crawler instance.
22
+ # @return [void]
23
+ attr_writer :storage
24
+
20
25
  def initialize
21
26
  super
22
27
  @skipped = false
@@ -11,8 +11,7 @@ module DaimonSkycrawlers
11
11
  # Display page information
12
12
  #
13
13
  def call(message)
14
- url = message[:url]
15
- page = storage.read(url, message)
14
+ page = storage.read(message)
16
15
  headers = JSON.parse(page.headers)
17
16
  headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
18
17
  dumped_message = <<LOG
@@ -99,10 +99,13 @@ module DaimonSkycrawlers
99
99
  # @param message [Hash] Must have key :url, :depth
100
100
  #
101
101
  def call(message)
102
- key_url = message[:url]
103
102
  depth = Integer(message[:depth] || 2)
104
103
  return if depth <= 1
105
- page = storage.read(key_url, message)
104
+ page = storage.read(message)
105
+ unless page
106
+ log.warn("Could not read page: url=#{message[:url]}, key=#{message[:key]}")
107
+ return
108
+ end
106
109
  @doc = Nokogiri::HTML(page.body)
107
110
  new_message = {
108
111
  depth: depth - 1,
@@ -111,8 +114,8 @@ module DaimonSkycrawlers
111
114
  links.each do |url|
112
115
  enqueue_url(url, link_message)
113
116
  end
114
- next_page_url = find_next_page_link
115
- if next_page_link
117
+ next_page_url = next_page_link
118
+ if next_page_url
116
119
  next_page_link_message = new_message.merge(@next_page_link_message)
117
120
  enqueue_url(next_page_url, next_page_link_message)
118
121
  end
@@ -128,7 +131,7 @@ module DaimonSkycrawlers
128
131
 
129
132
  def retrieve_links
130
133
  urls = @doc.search(*link_rules).map do |element|
131
- @extract_next_page_link.call(element)
134
+ @extract_link.call(element)
132
135
  end
133
136
  urls.uniq!
134
137
  apply_link_filters(urls) || []
@@ -29,9 +29,9 @@ module DaimonSkycrawlers
29
29
  #
30
30
  # Override this method in subclass
31
31
  #
32
- # @param url [String] the key to find data in storage
32
+ # @param message [Hash] this hash can include `:url`, `:key` to find page
33
33
  #
34
- def read(url, message = {})
34
+ def read(message = {})
35
35
  raise "Implement this in subclass"
36
36
  end
37
37
  end
@@ -39,9 +39,11 @@ module DaimonSkycrawlers
39
39
  #
40
40
  # Read data from files under base directory
41
41
  #
42
+ # @param message [Hash] this hash can include `:url`, `:key` to find page
42
43
  # @return [DaimonSkycrawlers::Storage::File::Page]
43
44
  #
44
- def read(url, message)
45
+ def read(message)
46
+ url = message[:url]
45
47
  key = message[:key]
46
48
  headers = JSON.parse(headers_path(url, key).read)
47
49
  body = body_path(url, key).read
@@ -16,7 +16,7 @@ module DaimonSkycrawlers
16
16
  #
17
17
  # Read nothing
18
18
  #
19
- def read(url, message = {})
19
+ def read(message = {})
20
20
  end
21
21
  end
22
22
  end
@@ -39,10 +39,10 @@ module DaimonSkycrawlers
39
39
  #
40
40
  # Fetch page identified by url
41
41
  #
42
- # @param url [String] identity of the page
43
- # @param message [Hash] this hash may include `:key` to find page
42
+ # @param message [Hash] this hash can include `:url`, `:key` to find page
44
43
  #
45
- def read(url, message = {})
44
+ def read(message = {})
45
+ url = message[:url]
46
46
  key = message[:key]
47
47
  if key
48
48
  Page.where(key: key).order(updated_at: :desc).limit(1).first
@@ -2,5 +2,5 @@ module DaimonSkycrawlers
2
2
  #
3
3
  # Version of this library
4
4
  #
5
- VERSION = "1.0.0-rc2"
5
+ VERSION = "1.0.0-rc3"
6
6
  end
@@ -6,8 +6,7 @@ require "daimon_skycrawlers/processor/spider"
6
6
  class AmazonRanking < DaimonSkycrawlers::Processor::Base
7
7
  Item = Struct.new(:rank, :name, :url, :star, :review)
8
8
  def call(message)
9
- url = message[:url]
10
- page = storage.read(url)
9
+ page = storage.read(message)
11
10
  doc = Nokogiri::HTML(page.body)
12
11
  ranking = []
13
12
  doc.search(".zg_itemRow").each do |item|
@@ -7,7 +7,7 @@ require_relative "../models/itp_shop"
7
7
  class ItpProcessor < DaimonSkycrawlers::Processor::Base
8
8
  def call(message)
9
9
  key_url = message[:url]
10
- page = storage.read(key_url)
10
+ page = storage.read(message)
11
11
  @doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
12
12
  ItpShop.transaction do
13
13
  prepare_shops do |shop|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.pre.rc2
4
+ version: 1.0.0.pre.rc3
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-07 00:00:00.000000000 Z
11
+ date: 2017-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -462,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
462
462
  version: 1.3.1
463
463
  requirements: []
464
464
  rubyforge_project:
465
- rubygems_version: 2.6.8
465
+ rubygems_version: 2.6.4
466
466
  signing_key:
467
467
  specification_version: 4
468
468
  summary: This is a crawler framework.