daimon_skycrawlers 1.0.0.pre.rc2 → 1.0.0.pre.rc3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 648bff4fe2019f82836bcf21c75dc6e29d45abc8
4
- data.tar.gz: f7691acca87a0686190171806a6a07aaa8f4015f
3
+ metadata.gz: e324bf361f8fca59312726836b0239decc958752
4
+ data.tar.gz: c6195b2168b0440a456ed2a3e1b4f18a75e65dca
5
5
  SHA512:
6
- metadata.gz: 378fe0060bacead511a87f637702cd5d24ce79c5acf3cba0bfac123d9dfc2cfcde41261a813dce69f9e11126d988df609a92df126f82235610db7c5a7332539b
7
- data.tar.gz: '0790c05562fc71542a1cf33ca0971bbf356659d20012f7968c6c4ffd9ec9df521f0e6d009aa8256ec05a7d1b3e494baaec187d1faa8bbd34102fb73875edcfd0'
6
+ metadata.gz: aaff4577b84b6842daf5a308dac8c001618c7fb34f6794bed7434ec253d15dd21b0fd9fa14d2fac2535f33507131cf15f5c950edfca369f02c92f778e79ddd17
7
+ data.tar.gz: 73c947c3ee5c9e60a0370273ce35128e6cb18e8606a5d8d3efc6bad12bbd3d203aeb5cf594a6eccf0a0231f9e03d1a8c47d35f64e164e0c35720e6a358c715fc
@@ -111,6 +111,8 @@ module DaimonSkycrawlers
111
111
  # 1. Download(fetch) data from given URL
112
112
  # 1. Run post processes (store downloaded data to storage)
113
113
  #
114
+ # @param message [Hash] parameters for crawler
115
+ #
114
116
  def process(message, &block)
115
117
  @skipped = false
116
118
  @n_processed_urls += 1
@@ -23,7 +23,8 @@ module DaimonSkycrawlers
23
23
  #
24
24
  def call(message, connection: nil)
25
25
  url = normalize_url(message[:url])
26
- page = storage.read(url, message)
26
+ message[:url] = url
27
+ page = storage.read(message)
27
28
  return true unless page
28
29
  if connection
29
30
  response = connection.head(url)
@@ -17,6 +17,11 @@ module DaimonSkycrawlers
17
17
  include DaimonSkycrawlers::Callbacks
18
18
  include DaimonSkycrawlers::Configurable
19
19
 
20
+ # @!attribute [w] storage
21
+ # Set storage to crawler instance.
22
+ # @return [void]
23
+ attr_writer :storage
24
+
20
25
  def initialize
21
26
  super
22
27
  @skipped = false
@@ -11,8 +11,7 @@ module DaimonSkycrawlers
11
11
  # Display page information
12
12
  #
13
13
  def call(message)
14
- url = message[:url]
15
- page = storage.read(url, message)
14
+ page = storage.read(message)
16
15
  headers = JSON.parse(page.headers)
17
16
  headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
18
17
  dumped_message = <<LOG
@@ -99,10 +99,13 @@ module DaimonSkycrawlers
99
99
  # @param message [Hash] Must have key :url, :depth
100
100
  #
101
101
  def call(message)
102
- key_url = message[:url]
103
102
  depth = Integer(message[:depth] || 2)
104
103
  return if depth <= 1
105
- page = storage.read(key_url, message)
104
+ page = storage.read(message)
105
+ unless page
106
+ log.warn("Could not read page: url=#{message[:url]}, key=#{message[:key]}")
107
+ return
108
+ end
106
109
  @doc = Nokogiri::HTML(page.body)
107
110
  new_message = {
108
111
  depth: depth - 1,
@@ -111,8 +114,8 @@ module DaimonSkycrawlers
111
114
  links.each do |url|
112
115
  enqueue_url(url, link_message)
113
116
  end
114
- next_page_url = find_next_page_link
115
- if next_page_link
117
+ next_page_url = next_page_link
118
+ if next_page_url
116
119
  next_page_link_message = new_message.merge(@next_page_link_message)
117
120
  enqueue_url(next_page_url, next_page_link_message)
118
121
  end
@@ -128,7 +131,7 @@ module DaimonSkycrawlers
128
131
 
129
132
  def retrieve_links
130
133
  urls = @doc.search(*link_rules).map do |element|
131
- @extract_next_page_link.call(element)
134
+ @extract_link.call(element)
132
135
  end
133
136
  urls.uniq!
134
137
  apply_link_filters(urls) || []
@@ -29,9 +29,9 @@ module DaimonSkycrawlers
29
29
  #
30
30
  # Override this method in subclass
31
31
  #
32
- # @param url [String] the key to find data in storage
32
+ # @param message [Hash] this hash can include `:url`, `:key` to find page
33
33
  #
34
- def read(url, message = {})
34
+ def read(message = {})
35
35
  raise "Implement this in subclass"
36
36
  end
37
37
  end
@@ -39,9 +39,11 @@ module DaimonSkycrawlers
39
39
  #
40
40
  # Read data from files under base directory
41
41
  #
42
+ # @param message [Hash] this hash can include `:url`, `:key` to find page
42
43
  # @return [DaimonSkycrawlers::Storage::File::Page]
43
44
  #
44
- def read(url, message)
45
+ def read(message)
46
+ url = message[:url]
45
47
  key = message[:key]
46
48
  headers = JSON.parse(headers_path(url, key).read)
47
49
  body = body_path(url, key).read
@@ -16,7 +16,7 @@ module DaimonSkycrawlers
16
16
  #
17
17
  # Read nothing
18
18
  #
19
- def read(url, message = {})
19
+ def read(message = {})
20
20
  end
21
21
  end
22
22
  end
@@ -39,10 +39,10 @@ module DaimonSkycrawlers
39
39
  #
40
40
  # Fetch page identified by url
41
41
  #
42
- # @param url [String] identity of the page
43
- # @param message [Hash] this hash may include `:key` to find page
42
+ # @param message [Hash] this hash can include `:url`, `:key` to find page
44
43
  #
45
- def read(url, message = {})
44
+ def read(message = {})
45
+ url = message[:url]
46
46
  key = message[:key]
47
47
  if key
48
48
  Page.where(key: key).order(updated_at: :desc).limit(1).first
@@ -2,5 +2,5 @@ module DaimonSkycrawlers
2
2
  #
3
3
  # Version of this library
4
4
  #
5
- VERSION = "1.0.0-rc2"
5
+ VERSION = "1.0.0-rc3"
6
6
  end
@@ -6,8 +6,7 @@ require "daimon_skycrawlers/processor/spider"
6
6
  class AmazonRanking < DaimonSkycrawlers::Processor::Base
7
7
  Item = Struct.new(:rank, :name, :url, :star, :review)
8
8
  def call(message)
9
- url = message[:url]
10
- page = storage.read(url)
9
+ page = storage.read(message)
11
10
  doc = Nokogiri::HTML(page.body)
12
11
  ranking = []
13
12
  doc.search(".zg_itemRow").each do |item|
@@ -7,7 +7,7 @@ require_relative "../models/itp_shop"
7
7
  class ItpProcessor < DaimonSkycrawlers::Processor::Base
8
8
  def call(message)
9
9
  key_url = message[:url]
10
- page = storage.read(key_url)
10
+ page = storage.read(message)
11
11
  @doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
12
12
  ItpShop.transaction do
13
13
  prepare_shops do |shop|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.pre.rc2
4
+ version: 1.0.0.pre.rc3
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-07 00:00:00.000000000 Z
11
+ date: 2017-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -462,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
462
462
  version: 1.3.1
463
463
  requirements: []
464
464
  rubyforge_project:
465
- rubygems_version: 2.6.8
465
+ rubygems_version: 2.6.4
466
466
  signing_key:
467
467
  specification_version: 4
468
468
  summary: This is a crawler framework.