apollo-crawler 0.1.24 → 0.1.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 71fb379b6ae32ceb79e40cce451c8a3646278d32
4
- data.tar.gz: 08fdec629298945a86993b91be3a743b880ad4a0
3
+ metadata.gz: 79e9ecdfed577a1ce13b74b24d6d5bc26bf75843
4
+ data.tar.gz: 6d93c6da6316d4666ddc5e434bab1caadc213ba3
5
5
  SHA512:
6
- metadata.gz: d0789d2ef99358144c90d148c378d9bf53e3084b326ede425ed7ff171ab450a4ed9de7a86494dbf49992ed235807f92e6a795cc52676bd61280a006408fc4e90
7
- data.tar.gz: 16ac99fc7e192fe137348c6d364e730c9c0071f274f200b395746c4247bb3958c7238b315b571a730014b5f8286602bbc67eb9f26db30c217440e15401d3ac95
6
+ metadata.gz: 863d10a255722bd53c9ee998e2886fd86d04cf7808284323d33f7da1fe77fc99ac0874f1e2a61f20c8656e039bc3bac2d2a9cef911e9b6e6d12266e91636b3bc
7
+ data.tar.gz: db77a2d4606dcecbec1ae2e0d872cc41f6fea64012280e179b1147d8bbabfe7eef5446bf364978ee930f338cb367624e30a86e11fe0613e4499e03dcbc670e4b
@@ -57,5 +57,8 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/planner/planners')
57
57
  # Program
58
58
  require File.join(File.dirname(__FILE__), 'apollo_crawler/program/programs')
59
59
 
60
+ # Scheduler
61
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/scheduler/schedulers')
62
+
60
63
  # Stores
61
64
  require File.join(File.dirname(__FILE__), 'apollo_crawler/store/stores')
@@ -18,5 +18,8 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
21
22
  require File.join(File.dirname(__FILE__), 'base_agent')
23
+ require File.join(File.dirname(__FILE__), 'domainer_agent')
24
+ require File.join(File.dirname(__FILE__), 'crawler_agent')
22
25
  require File.join(File.dirname(__FILE__), 'fetcher_agent')
@@ -18,6 +18,8 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
22
+
21
23
  module Apollo
22
24
  module Agent
23
25
  class BaseAgent
@@ -0,0 +1,77 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
22
+ require File.join(File.dirname(__FILE__), 'base_agent')
23
+
24
+ require File.join(File.dirname(__FILE__), '../crawler/crawlers')
25
+
26
+ require 'nokogiri'
27
+
28
+ module Apollo
29
+ module Agent
30
+ class CrawlerAgent < BaseAgent
31
+ attr_accessor :declarations
32
+
33
+ def initialize(amqp, opts={})
34
+ if(opts[:verbose])
35
+ puts "Initializing crawler agent..."
36
+ end
37
+
38
+ # Declarations
39
+ channel = amqp.create_channel
40
+ declarations = Apollo::Agent.declare_exchanges(channel, opts)
41
+
42
+ # Binding
43
+ declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
44
+ puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
45
+
46
+ msg = JSON.parse(payload)
47
+
48
+ request = msg['request']
49
+ response = msg['response']
50
+
51
+ # puts "PLANEEEER: #{msg.inspect}"
52
+
53
+ doc = Nokogiri::HTML(response['body'])
54
+ crawler = request['crawler_name'].constantize.new
55
+ data = crawler.extract_data(doc)
56
+ links = crawler.extract_links(doc)
57
+
58
+ # puts crawler.to_s
59
+ # puts res.inspect
60
+
61
+ if(metadata[:reply_to] != nil)
62
+ x = declarations[:exchanges][metadata[:reply_to]]
63
+
64
+ msg = {
65
+ :request => request,
66
+ :response => response,
67
+ :data => data,
68
+ :links => links
69
+ }
70
+
71
+ x.publish(msg.to_json)
72
+ end
73
+ end
74
+ end
75
+ end # class CrawlerAgent
76
+ end # module Agent
77
+ end # module Apollo
@@ -0,0 +1,51 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
22
+ require File.join(File.dirname(__FILE__), 'base_agent')
23
+
24
+ require File.join(File.dirname(__FILE__), '../crawler/crawlers')
25
+
26
+ require 'nokogiri'
27
+
28
+ module Apollo
29
+ module Agent
30
+ class DomainerAgent < BaseAgent
31
+ attr_accessor :declarations
32
+
33
+ def initialize(amqp, opts={})
34
+ if(opts[:verbose])
35
+ puts "Initializing crawler agent..."
36
+ end
37
+
38
+ # Declarations
39
+ channel = amqp.create_channel
40
+ declarations = Apollo::Agent.declare_exchanges(channel, opts)
41
+
42
+ # Binding
43
+ declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
44
+ puts "DomainerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
45
+
46
+ msg = JSON.parse(payload)
47
+ end
48
+ end
49
+ end # class DomainerAgent
50
+ end # module Agent
51
+ end # module Apollo
@@ -0,0 +1,55 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ module Apollo
22
+ module Agent
23
+ def self.declare_exchanges(channel, opts={})
24
+ if(opts[:verbose])
25
+ puts "Declaring AMQP Exchanges"
26
+ end
27
+
28
+ # Exchanges
29
+ exchanges = {}
30
+ exchanges["crawler"] = channel.direct("crawler", :auto_delete => false, :durable => true)
31
+ exchanges["domainer"] = channel.direct("domainer", :auto_delete => false, :durable => true)
32
+ exchanges["fetcher"] = channel.direct("fetcher", :auto_delete => false, :durable => true)
33
+ exchanges["planner.crawled"] = channel.direct("planner.crawled", :auto_delete => false, :durable => true)
34
+ exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
35
+ exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
36
+
37
+ # Queues
38
+ queues = {}
39
+ queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
40
+ queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
41
+ queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
42
+ queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
43
+ queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
44
+ queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
45
+
46
+ # Compose res
47
+ res = {
48
+ :exchanges => exchanges,
49
+ :queues => queues
50
+ }
51
+
52
+ return res
53
+ end
54
+ end
55
+ end
@@ -18,13 +18,22 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
21
22
  require File.join(File.dirname(__FILE__), 'base_agent')
23
+
22
24
  require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
23
25
 
26
+ require 'digest/sha1'
27
+ require 'thread/pool'
28
+
24
29
  module Apollo
25
30
  module Agent
26
31
  class FetcherAgent < BaseAgent
32
+ THREAD_POOL_SIZE = 10
33
+
27
34
  attr_accessor :fetcher
35
+ attr_accessor :declarations
36
+ attr_accessor :thread_pool
28
37
 
29
38
  def initialize(amqp, opts={})
30
39
  self.fetcher = Apollo::Fetcher::SmartFetcher.new
@@ -33,23 +42,64 @@ module Apollo
33
42
  puts "Initializing fetcher agent..."
34
43
  end
35
44
 
36
- ch = amqp.create_channel
37
- q = ch.queue("fetcher", :auto_delete => false, :durable => true)
38
- x = ch.default_exchange
45
+ thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
39
46
 
40
- q.subscribe do |delivery_info, metadata, payload|
41
- res = nil
47
+ # Declarations
48
+ channel = amqp.create_channel
49
+ declarations = Apollo::Agent.declare_exchanges(channel, opts)
42
50
 
43
- puts "Received #{payload}" if opts[:verbose]
44
-
45
- Thread.new do |t|
51
+ # Binding
52
+ declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
53
+ thread_pool.process {
46
54
  queued_url = JSON.parse(payload)
47
- # puts queued_url["url"]
48
- # res = Apollo::Fetcher::SmartFetcher::fetch(queued_url["url"])
49
- # puts "#{queued_url['url']} - " + res.inspect
50
- end
55
+ url = queued_url["url"]
56
+
57
+ puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
58
+
59
+ doc = nil
60
+ begin
61
+ doc = Apollo::Fetcher::SmartFetcher::fetch(url)
62
+ rescue Exception => e
63
+ puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
64
+ end
65
+
66
+ doc = get_fetched_doc(queued_url, doc, metadata, opts)
67
+
68
+ # send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
69
+
70
+ if(metadata && metadata[:reply_to])
71
+ exchange_name = metadata[:reply_to]
72
+
73
+ if(exchange_name != nil)
74
+ msg = get_response_msg(queued_url, doc)
75
+
76
+ x = declarations[:exchanges][exchange_name]
77
+ x.publish(msg.to_json)
78
+ end
79
+ end
80
+ }
51
81
  end
52
82
  end
83
+
84
+ def get_fetched_doc(queued_url, doc, metadata, opts={})
85
+ url = queued_url["url"]
86
+
87
+ res = Apollo::Model::RawDocument.new
88
+ res.headers = doc.headers
89
+ res.body = doc.body
90
+ res.sha_hash = Digest::SHA1.hexdigest(doc.body)
91
+ res.status = doc.status
92
+ res.url = url
93
+
94
+ return res
95
+ end
96
+
97
+ def get_response_msg(queued_url, doc)
98
+ return {
99
+ :request => queued_url,
100
+ :response => doc
101
+ }
102
+ end
53
103
  end # class FetcherAgent
54
104
  end # module Agent
55
105
  end # module Apollo
@@ -172,8 +172,8 @@ module Apollo
172
172
  def self.create_metadoc(url, doc)
173
173
  return {
174
174
  'url' => url,
175
- 'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
176
- 'hash' => Digest::SHA256.new.update(doc).hexdigest,
175
+ 'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
176
+ 'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
177
177
  'created_at' => Time.now.utc,
178
178
  'expires_at' => nil,
179
179
  'version' => 0
@@ -46,14 +46,7 @@ module Apollo
46
46
  end
47
47
 
48
48
  def extract_links(doc)
49
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
50
- url = BaseCrawler.try_get_url(self.url, node['href']).to_s
51
- next if url.nil?
52
-
53
- {
54
- :link => url
55
- }
56
- }
49
+ return []
57
50
  end
58
51
  end # class SlashdotCrawler
59
52
  end # module Crawler
@@ -24,6 +24,9 @@ require "open-uri"
24
24
  require 'faraday'
25
25
  require 'ipaddr'
26
26
 
27
+ # require 'resolv'
28
+ # require 'resolv-replace'
29
+
27
30
  module Apollo
28
31
  module Fetcher
29
32
  class BaseFetcher
@@ -36,7 +39,12 @@ module Apollo
36
39
  end
37
40
 
38
41
  def self.fetch(url, options = {})
39
- uri = URI.parse(url.to_s)
42
+ begin
43
+ uri = URI.parse(url.to_s)
44
+ rescue Exception => e
45
+ puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
46
+ return nil
47
+ end
40
48
 
41
49
  # See https://github.com/lostisland/faraday
42
50
  conn = Faraday.new(:url => url) do |faraday|
@@ -46,12 +54,17 @@ module Apollo
46
54
  end
47
55
 
48
56
  # Make request
49
- res = conn.get(uri) do |request|
50
- request.headers = BaseFetcher.get_fake_headers(uri)
57
+ begin
58
+ res = conn.get(uri) do |request|
59
+ request.headers = BaseFetcher.get_fake_headers(uri)
60
+ end
61
+ rescue Exception => e
62
+ puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
63
+ return nil
51
64
  end
52
65
 
53
66
  # Return result
54
- return res.body
67
+ return res
55
68
  end
56
69
  end # class BaseFetcher
57
70
  end # module Fetcher
@@ -54,5 +54,8 @@ require File.join(File.dirname(__FILE__), 'planner/planners')
54
54
  # Programs
55
55
  require File.join(File.dirname(__FILE__), 'program/programs')
56
56
 
57
+ # Programs
58
+ require File.join(File.dirname(__FILE__), 'scheduler/schedulers')
59
+
57
60
  # Stores
58
61
  require File.join(File.dirname(__FILE__), 'store/stores')
@@ -0,0 +1,37 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_model')
22
+
23
+ module Apollo
24
+ module Model
25
+ class DataSource < BaseModel
26
+ include Mongoid::Document
27
+ include Mongoid::Timestamps
28
+
29
+ store_in collection: "data_sources"
30
+
31
+ field :url
32
+
33
+ # Indexes
34
+ index({ created_at: 1, updated_at: 1, url: 1 })
35
+ end # class DataSource
36
+ end # module Model
37
+ end # module Apollo
@@ -0,0 +1,37 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_model')
22
+
23
+ module Apollo
24
+ module Model
25
+ class Domain < BaseModel
26
+ include Mongoid::Document
27
+ include Mongoid::Timestamps
28
+
29
+ store_in collection: "domains"
30
+
31
+ field :name
32
+
33
+ # Indexes
34
+ index({ created_at: 1, updated_at: 1, name: 1 })
35
+ end # class Domain
36
+ end # module Model
37
+ end # module Apollo
@@ -20,5 +20,7 @@
20
20
 
21
21
  require File.join(File.dirname(__FILE__), 'base_model')
22
22
  require File.join(File.dirname(__FILE__), 'crawler')
23
+ require File.join(File.dirname(__FILE__), 'data_source')
24
+ require File.join(File.dirname(__FILE__), 'domain')
23
25
  require File.join(File.dirname(__FILE__), 'queued_url')
24
- require File.join(File.dirname(__FILE__), 'raw_document')
26
+ require File.join(File.dirname(__FILE__), 'raw_document')
@@ -30,9 +30,10 @@ module Apollo
30
30
 
31
31
  field :url
32
32
  field :state
33
+ field :crawler_name
33
34
 
34
35
  # Indexes
35
- index({ created_at: 1, updated_at: 1 })
36
+ index({ created_at: 1, updated_at: 1, crawler_name: 1 })
36
37
  end # class QueuedUrl
37
38
  end # module Model
38
39
  end # module Apollo
@@ -28,10 +28,14 @@ module Apollo
28
28
 
29
29
  store_in collection: "raw_docs"
30
30
 
31
+ field :url
32
+ field :headers
31
33
  field :body
34
+ field :status
35
+ field :sha_hash
32
36
 
33
37
  # Indexes
34
- index({ created_at: 1, updated_at: 1 })
38
+ index({ created_at: 1, updated_at: 1, sha_hash: 1 })
35
39
  end # class RawDocument
36
40
  end # module Model
37
41
  end # module Apollo
@@ -22,43 +22,101 @@ require File.join(File.dirname(__FILE__),'base_planner')
22
22
 
23
23
  require File.join(File.dirname(__FILE__),'../model/models.rb')
24
24
 
25
+ require File.join(File.dirname(__FILE__),'../agent/exchanges.rb')
26
+
27
+ require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
28
+
29
+ require 'nokogiri'
30
+
25
31
  module Apollo
26
32
  module Planner
27
33
  class SmartPlanner < BasePlanner
28
34
  attr_accessor :amqp
29
35
  attr_accessor :mongo
36
+ attr_accessor :declarations
30
37
 
31
- def initialize(amqp=nil, mongo=nil)
38
+ def initialize(amqp=nil, mongo=nil, opts={})
32
39
  self.amqp = amqp
33
40
  self.mongo = mongo
34
- end
35
41
 
36
- def fetch_url(url, opts={})
37
- puts "AMQP fetching '#{url.inspect}'"
42
+ # Declarations
43
+ channel = amqp.create_channel
44
+ self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
38
45
 
39
- ch = amqp.create_channel
40
- x = ch.default_exchange
41
- x.publish(url.to_json, :routing_key => "fetcher")
46
+ # Bindings
47
+ declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
48
+ msg = JSON.parse(payload)
42
49
 
43
- end
50
+ request = msg['request']
51
+ response = msg['response']
44
52
 
45
- def fetch_queued_urls(opts={})
46
- urls = Apollo::Model::QueuedUrl.where({:state => :queued})
47
- return if urls.count < 1
53
+ doc = Apollo::Model::QueuedUrl.find(request["_id"])
54
+ doc.update_attributes(msg['request'])
55
+ doc.state = "fetched"
56
+ doc.save
57
+
58
+ doc = Apollo::Model::RawDocument.where(:url => request['url']).first
59
+ if doc
60
+ if doc.sha_hash != response['sha_hash']
61
+ puts "Removing old cached version of '#{request['url']}'" if opts[:verbose]
62
+
63
+ doc.destroy
64
+ doc = nil
65
+ else
66
+ puts "Using cached version of '#{request['url']}'" if opts[:verbose]
67
+ end
68
+ else
69
+ doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
70
+ end
71
+
72
+ if(doc.nil?)
73
+ doc = Apollo::Model::RawDocument.new(response).save
74
+ end
75
+
76
+ # Publish
77
+ declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
78
+ end
79
+
80
+ declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
81
+ msg = JSON.parse(payload)
48
82
 
83
+ puts "DOMAINED !!!"
84
+ end
85
+
86
+ declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
87
+ msg = JSON.parse(payload)
88
+
89
+ request = msg['request']
90
+ response = msg['response']
91
+ data = msg['data']
92
+ links = msg['links']
93
+ links = [] if links.nil?
94
+
95
+ links.each do |url|
96
+ link = url['link']
97
+
98
+ Apollo::Scheduler::BaseScheduler::schedule(link, request['crawler_name'])
99
+ end
100
+
101
+ # puts JSON.pretty_generate(data)
102
+ # puts JSON.pretty_generate(links)
103
+ end
104
+ end
105
+
106
+ def fetch_url(url, opts={})
49
107
  if(opts[:verbose])
50
- puts "Fetching Queued URLS"
108
+ puts "AMQP fetching '#{url.inspect}'"
51
109
  end
52
110
 
53
- puts "Count of URLs in Queue: #{urls.count}" if opts[:verbose]
111
+ # Publish
112
+ declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
113
+ end
54
114
 
55
- urls.each do |url|
56
- url.state = :fetching
57
- url.save
115
+ def fetch_queued_urls(opts={})
116
+ while url = Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
117
+ # puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
58
118
 
59
119
  fetch_url(url, opts)
60
-
61
- # puts "Removing URL from Queue '#{url.inspect}'" if opts[:verbose]
62
120
  end
63
121
  end
64
122
 
@@ -37,6 +37,7 @@ require 'eventmachine'
37
37
  require 'em-http'
38
38
 
39
39
  require 'fileutils'
40
+ require 'csv'
40
41
 
41
42
  require 'mongoid'
42
43
 
@@ -44,6 +45,8 @@ require File.join(File.dirname(__FILE__), '..', 'version')
44
45
 
45
46
  require File.join(File.dirname(__FILE__),'base_program')
46
47
 
48
+ require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
49
+
47
50
  module Apollo
48
51
  # Apollo Crawler Base Directory
49
52
  APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")
@@ -97,29 +100,56 @@ module Apollo
97
100
  def enqueue_crawlers_urls(amqp, crawlers=Apollo::Crawler::BaseCrawler.subclasses, opts={})
98
101
  crawlers.each do |crawler|
99
102
  i = crawler.new
100
- puts "Queuying Crawler base URL: '#{i.url}'" if opts[:verbose]
101
- qu = Apollo::Model::QueuedUrl.new(:url => i.url, :state => :queued)
102
- qu.save
103
+ Apollo::Scheduler::BaseScheduler::schedule(i.url, crawler)
103
104
  end
104
105
  end
105
106
 
107
+ def init_crawlers(amqp, opts={})
108
+ crawlers = []
109
+ crawlers << Apollo::Agent::CrawlerAgent.new(amqp, self.options)
110
+ end
111
+
112
+ def init_domainers(amqp, opts={})
113
+ domainers = []
114
+ domainers << Apollo::Agent::DomainerAgent.new(amqp, self.options)
115
+ end
116
+
106
117
  def init_fetchers(amqp, opts={})
107
118
  fetchers = []
108
119
  fetchers << Apollo::Agent::FetcherAgent.new(amqp, self.options)
109
120
 
121
+ # TODO: This should not be here!
110
122
  enqueue_crawlers_urls(amqp, Apollo::Crawler::BaseCrawler.subclasses, opts)
111
-
112
- # ch = self.amqp.create_channel
113
- # x = ch.default_exchange
114
- # x.publish("Hello!", :routing_key => "fetcher")
115
123
  end
116
124
 
117
125
  def init_agents(amqp, opts={})
118
126
  puts "Initializing agents"
119
127
 
128
+ init_crawlers(amqp, opts)
129
+ init_domainers(amqp, opts)
120
130
  init_fetchers(amqp, opts)
121
131
  end
122
132
 
133
+ def init_domains(opts={})
134
+ path = File.join(File.dirname(__FILE__), "../../../tmp/top-1m.csv")
135
+ puts "#{path}"
136
+ if(File.exists?(path) == false)
137
+ return 0
138
+ end
139
+
140
+ Thread::new {
141
+ CSV.foreach(path) do |row|
142
+ name = row[1]
143
+ domain = Apollo::Model::Domain.where({:name => name}).first()
144
+ if(domain.nil?)
145
+ domain = Apollo::Model::Domain.new({:name => name})
146
+ domain.save
147
+ print "."
148
+ end
149
+ end
150
+ }
151
+ end
152
+
123
153
  def init_program(args)
124
154
  res = super(args)
125
155
  return res unless res.nil?
@@ -148,6 +178,8 @@ module Apollo
148
178
  res = super(args)
149
179
  return res unless res.nil?
150
180
 
181
+ init_domains()
182
+
151
183
  # Here we start
152
184
  # if(ARGV.length < 1)
153
185
  # puts optparser
@@ -156,7 +188,7 @@ module Apollo
156
188
 
157
189
  res_code = 0
158
190
  if(self.options[:daemon])
159
- planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo)
191
+ planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo, self.options)
160
192
  res_code = planner.run(self.options)
161
193
  end
162
194
 
@@ -0,0 +1,39 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), "../model/models")
22
+
23
+ module Apollo
24
+ module Scheduler
25
+ class BaseScheduler
26
+ def self.schedule(url, crawler=nil, opts={})
27
+ queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
28
+
29
+ if(queued_url.nil?)
30
+ qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
31
+ qu.save
32
+ return qu
33
+ end
34
+
35
+ return nil
36
+ end
37
+ end # class BaseScheduler
38
+ end # module Scheduler
39
+ end # module Apollo
@@ -0,0 +1,21 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_scheduler')
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.24'
22
+ VERSION = '0.1.25'
23
23
  end # Apollo
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.24
4
+ version: 0.1.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-23 00:00:00.000000000 Z
11
+ date: 2013-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -414,6 +414,9 @@ files:
414
414
  - ./lib/apollo_crawler/adapter/mongo_adapter.rb
415
415
  - ./lib/apollo_crawler/agent/agents.rb
416
416
  - ./lib/apollo_crawler/agent/base_agent.rb
417
+ - ./lib/apollo_crawler/agent/crawler_agent.rb
418
+ - ./lib/apollo_crawler/agent/domainer_agent.rb
419
+ - ./lib/apollo_crawler/agent/exchanges.rb
417
420
  - ./lib/apollo_crawler/agent/fetcher_agent.rb
418
421
  - ./lib/apollo_crawler/cache/base_cache.rb
419
422
  - ./lib/apollo_crawler/cache/caches.rb
@@ -452,6 +455,8 @@ files:
452
455
  - ./lib/apollo_crawler/logger/loggers.rb
453
456
  - ./lib/apollo_crawler/model/base_model.rb
454
457
  - ./lib/apollo_crawler/model/crawler.rb
458
+ - ./lib/apollo_crawler/model/data_source.rb
459
+ - ./lib/apollo_crawler/model/domain.rb
455
460
  - ./lib/apollo_crawler/model/models.rb
456
461
  - ./lib/apollo_crawler/model/queued_url.rb
457
462
  - ./lib/apollo_crawler/model/raw_document.rb
@@ -463,6 +468,8 @@ files:
463
468
  - ./lib/apollo_crawler/program/crawler_program.rb
464
469
  - ./lib/apollo_crawler/program/platform_program.rb
465
470
  - ./lib/apollo_crawler/program/programs.rb
471
+ - ./lib/apollo_crawler/scheduler/base_scheduler.rb
472
+ - ./lib/apollo_crawler/scheduler/schedulers.rb
466
473
  - ./lib/apollo_crawler/store/base_store.rb
467
474
  - ./lib/apollo_crawler/store/stores.rb
468
475
  - ./lib/apollo_crawler/version.rb