apollo-crawler 0.1.24 → 0.1.25

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 71fb379b6ae32ceb79e40cce451c8a3646278d32
4
- data.tar.gz: 08fdec629298945a86993b91be3a743b880ad4a0
3
+ metadata.gz: 79e9ecdfed577a1ce13b74b24d6d5bc26bf75843
4
+ data.tar.gz: 6d93c6da6316d4666ddc5e434bab1caadc213ba3
5
5
  SHA512:
6
- metadata.gz: d0789d2ef99358144c90d148c378d9bf53e3084b326ede425ed7ff171ab450a4ed9de7a86494dbf49992ed235807f92e6a795cc52676bd61280a006408fc4e90
7
- data.tar.gz: 16ac99fc7e192fe137348c6d364e730c9c0071f274f200b395746c4247bb3958c7238b315b571a730014b5f8286602bbc67eb9f26db30c217440e15401d3ac95
6
+ metadata.gz: 863d10a255722bd53c9ee998e2886fd86d04cf7808284323d33f7da1fe77fc99ac0874f1e2a61f20c8656e039bc3bac2d2a9cef911e9b6e6d12266e91636b3bc
7
+ data.tar.gz: db77a2d4606dcecbec1ae2e0d872cc41f6fea64012280e179b1147d8bbabfe7eef5446bf364978ee930f338cb367624e30a86e11fe0613e4499e03dcbc670e4b
@@ -57,5 +57,8 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/planner/planners')
57
57
  # Program
58
58
  require File.join(File.dirname(__FILE__), 'apollo_crawler/program/programs')
59
59
 
60
+ # Scheduler
61
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/scheduler/schedulers')
62
+
60
63
  # Stores
61
64
  require File.join(File.dirname(__FILE__), 'apollo_crawler/store/stores')
@@ -18,5 +18,8 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
21
22
  require File.join(File.dirname(__FILE__), 'base_agent')
23
+ require File.join(File.dirname(__FILE__), 'domainer_agent')
24
+ require File.join(File.dirname(__FILE__), 'crawler_agent')
22
25
  require File.join(File.dirname(__FILE__), 'fetcher_agent')
@@ -18,6 +18,8 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
22
+
21
23
  module Apollo
22
24
  module Agent
23
25
  class BaseAgent
@@ -0,0 +1,77 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
22
+ require File.join(File.dirname(__FILE__), 'base_agent')
23
+
24
+ require File.join(File.dirname(__FILE__), '../crawler/crawlers')
25
+
26
+ require 'nokogiri'
27
+
28
+ module Apollo
29
+ module Agent
30
+ class CrawlerAgent < BaseAgent
31
+ attr_accessor :declarations
32
+
33
+ def initialize(amqp, opts={})
34
+ if(opts[:verbose])
35
+ puts "Initializing crawler agent..."
36
+ end
37
+
38
+ # Declarations
39
+ channel = amqp.create_channel
40
+ declarations = Apollo::Agent.declare_exchanges(channel, opts)
41
+
42
+ # Binding
43
+ declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
44
+ puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
45
+
46
+ msg = JSON.parse(payload)
47
+
48
+ request = msg['request']
49
+ response = msg['response']
50
+
51
+ # puts "PLANEEEER: #{msg.inspect}"
52
+
53
+ doc = Nokogiri::HTML(response['body'])
54
+ crawler = request['crawler_name'].constantize.new
55
+ data = crawler.extract_data(doc)
56
+ links = crawler.extract_links(doc)
57
+
58
+ # puts crawler.to_s
59
+ # puts res.inspect
60
+
61
+ if(metadata[:reply_to] != nil)
62
+ x = declarations[:exchanges][metadata[:reply_to]]
63
+
64
+ msg = {
65
+ :request => request,
66
+ :response => response,
67
+ :data => data,
68
+ :links => links
69
+ }
70
+
71
+ x.publish(msg.to_json)
72
+ end
73
+ end
74
+ end
75
+ end # class CrawlerAgent
76
+ end # module Agent
77
+ end # module Apollo
@@ -0,0 +1,51 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
22
+ require File.join(File.dirname(__FILE__), 'base_agent')
23
+
24
+ require File.join(File.dirname(__FILE__), '../crawler/crawlers')
25
+
26
+ require 'nokogiri'
27
+
28
+ module Apollo
29
+ module Agent
30
+ class DomainerAgent < BaseAgent
31
+ attr_accessor :declarations
32
+
33
+ def initialize(amqp, opts={})
34
+ if(opts[:verbose])
35
+ puts "Initializing crawler agent..."
36
+ end
37
+
38
+ # Declarations
39
+ channel = amqp.create_channel
40
+ declarations = Apollo::Agent.declare_exchanges(channel, opts)
41
+
42
+ # Binding
43
+ declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
44
+ puts "DomainerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
45
+
46
+ msg = JSON.parse(payload)
47
+ end
48
+ end
49
+ end # class DomainerAgent
50
+ end # module Agent
51
+ end # module Apollo
@@ -0,0 +1,55 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ module Apollo
22
+ module Agent
23
+ def self.declare_exchanges(channel, opts={})
24
+ if(opts[:verbose])
25
+ puts "Declaring AMQP Exchanges"
26
+ end
27
+
28
+ # Exchanges
29
+ exchanges = {}
30
+ exchanges["crawler"] = channel.direct("crawler", :auto_delete => false, :durable => true)
31
+ exchanges["domainer"] = channel.direct("domainer", :auto_delete => false, :durable => true)
32
+ exchanges["fetcher"] = channel.direct("fetcher", :auto_delete => false, :durable => true)
33
+ exchanges["planner.crawled"] = channel.direct("planner.crawled", :auto_delete => false, :durable => true)
34
+ exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
35
+ exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
36
+
37
+ # Queues
38
+ queues = {}
39
+ queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
40
+ queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
41
+ queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
42
+ queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
43
+ queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
44
+ queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
45
+
46
+ # Compose res
47
+ res = {
48
+ :exchanges => exchanges,
49
+ :queues => queues
50
+ }
51
+
52
+ return res
53
+ end
54
+ end
55
+ end
@@ -18,13 +18,22 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require File.join(File.dirname(__FILE__), 'exchanges')
21
22
  require File.join(File.dirname(__FILE__), 'base_agent')
23
+
22
24
  require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
23
25
 
26
+ require 'digest/sha1'
27
+ require 'thread/pool'
28
+
24
29
  module Apollo
25
30
  module Agent
26
31
  class FetcherAgent < BaseAgent
32
+ THREAD_POOL_SIZE = 10
33
+
27
34
  attr_accessor :fetcher
35
+ attr_accessor :declarations
36
+ attr_accessor :thread_pool
28
37
 
29
38
  def initialize(amqp, opts={})
30
39
  self.fetcher = Apollo::Fetcher::SmartFetcher.new
@@ -33,23 +42,64 @@ module Apollo
33
42
  puts "Initializing fetcher agent..."
34
43
  end
35
44
 
36
- ch = amqp.create_channel
37
- q = ch.queue("fetcher", :auto_delete => false, :durable => true)
38
- x = ch.default_exchange
45
+ thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
39
46
 
40
- q.subscribe do |delivery_info, metadata, payload|
41
- res = nil
47
+ # Declarations
48
+ channel = amqp.create_channel
49
+ declarations = Apollo::Agent.declare_exchanges(channel, opts)
42
50
 
43
- puts "Received #{payload}" if opts[:verbose]
44
-
45
- Thread.new do |t|
51
+ # Binding
52
+ declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
53
+ thread_pool.process {
46
54
  queued_url = JSON.parse(payload)
47
- # puts queued_url["url"]
48
- # res = Apollo::Fetcher::SmartFetcher::fetch(queued_url["url"])
49
- # puts "#{queued_url['url']} - " + res.inspect
50
- end
55
+ url = queued_url["url"]
56
+
57
+ puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
58
+
59
+ doc = nil
60
+ begin
61
+ doc = Apollo::Fetcher::SmartFetcher::fetch(url)
62
+ rescue Exception => e
63
+ puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
64
+ end
65
+
66
+ doc = get_fetched_doc(queued_url, doc, metadata, opts)
67
+
68
+ # send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
69
+
70
+ if(metadata && metadata[:reply_to])
71
+ exchange_name = metadata[:reply_to]
72
+
73
+ if(exchange_name != nil)
74
+ msg = get_response_msg(queued_url, doc)
75
+
76
+ x = declarations[:exchanges][exchange_name]
77
+ x.publish(msg.to_json)
78
+ end
79
+ end
80
+ }
51
81
  end
52
82
  end
83
+
84
+ def get_fetched_doc(queued_url, doc, metadata, opts={})
85
+ url = queued_url["url"]
86
+
87
+ res = Apollo::Model::RawDocument.new
88
+ res.headers = doc.headers
89
+ res.body = doc.body
90
+ res.sha_hash = Digest::SHA1.hexdigest(doc.body)
91
+ res.status = doc.status
92
+ res.url = url
93
+
94
+ return res
95
+ end
96
+
97
+ def get_response_msg(queued_url, doc)
98
+ return {
99
+ :request => queued_url,
100
+ :response => doc
101
+ }
102
+ end
53
103
  end # class FetcherAgent
54
104
  end # module Agent
55
105
  end # module Apollo
@@ -172,8 +172,8 @@ module Apollo
172
172
  def self.create_metadoc(url, doc)
173
173
  return {
174
174
  'url' => url,
175
- 'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
176
- 'hash' => Digest::SHA256.new.update(doc).hexdigest,
175
+ 'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
176
+ 'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
177
177
  'created_at' => Time.now.utc,
178
178
  'expires_at' => nil,
179
179
  'version' => 0
@@ -46,14 +46,7 @@ module Apollo
46
46
  end
47
47
 
48
48
  def extract_links(doc)
49
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
50
- url = BaseCrawler.try_get_url(self.url, node['href']).to_s
51
- next if url.nil?
52
-
53
- {
54
- :link => url
55
- }
56
- }
49
+ return []
57
50
  end
58
51
  end # class SlashdotCrawler
59
52
  end # module Crawler
@@ -24,6 +24,9 @@ require "open-uri"
24
24
  require 'faraday'
25
25
  require 'ipaddr'
26
26
 
27
+ # require 'resolv'
28
+ # require 'resolv-replace'
29
+
27
30
  module Apollo
28
31
  module Fetcher
29
32
  class BaseFetcher
@@ -36,7 +39,12 @@ module Apollo
36
39
  end
37
40
 
38
41
  def self.fetch(url, options = {})
39
- uri = URI.parse(url.to_s)
42
+ begin
43
+ uri = URI.parse(url.to_s)
44
+ rescue Exception => e
45
+ puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
46
+ return nil
47
+ end
40
48
 
41
49
  # See https://github.com/lostisland/faraday
42
50
  conn = Faraday.new(:url => url) do |faraday|
@@ -46,12 +54,17 @@ module Apollo
46
54
  end
47
55
 
48
56
  # Make request
49
- res = conn.get(uri) do |request|
50
- request.headers = BaseFetcher.get_fake_headers(uri)
57
+ begin
58
+ res = conn.get(uri) do |request|
59
+ request.headers = BaseFetcher.get_fake_headers(uri)
60
+ end
61
+ rescue Exception => e
62
+ puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
63
+ return nil
51
64
  end
52
65
 
53
66
  # Return result
54
- return res.body
67
+ return res
55
68
  end
56
69
  end # class BaseFetcher
57
70
  end # module Fetcher
@@ -54,5 +54,8 @@ require File.join(File.dirname(__FILE__), 'planner/planners')
54
54
  # Programs
55
55
  require File.join(File.dirname(__FILE__), 'program/programs')
56
56
 
57
+ # Schedulers
58
+ require File.join(File.dirname(__FILE__), 'scheduler/schedulers')
59
+
57
60
  # Stores
58
61
  require File.join(File.dirname(__FILE__), 'store/stores')
@@ -0,0 +1,37 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_model')
22
+
23
+ module Apollo
24
+ module Model
25
+ class DataSource < BaseModel
26
+ include Mongoid::Document
27
+ include Mongoid::Timestamps
28
+
29
+ store_in collection: "data_sources"
30
+
31
+ field :url
32
+
33
+ # Indexes
34
+ index({ created_at: 1, updated_at: 1, url: 1 })
35
+ end # class DataSource
36
+ end # module Model
37
+ end # module Apollo
@@ -0,0 +1,37 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_model')
22
+
23
+ module Apollo
24
+ module Model
25
+ class Domain < BaseModel
26
+ include Mongoid::Document
27
+ include Mongoid::Timestamps
28
+
29
+ store_in collection: "domains"
30
+
31
+ field :name
32
+
33
+ # Indexes
34
+ index({ created_at: 1, updated_at: 1, name: 1 })
35
+ end # class Domain
36
+ end # module Model
37
+ end # module Apollo
@@ -20,5 +20,7 @@
20
20
 
21
21
  require File.join(File.dirname(__FILE__), 'base_model')
22
22
  require File.join(File.dirname(__FILE__), 'crawler')
23
+ require File.join(File.dirname(__FILE__), 'data_source')
24
+ require File.join(File.dirname(__FILE__), 'domain')
23
25
  require File.join(File.dirname(__FILE__), 'queued_url')
24
- require File.join(File.dirname(__FILE__), 'raw_document')
26
+ require File.join(File.dirname(__FILE__), 'raw_document')
@@ -30,9 +30,10 @@ module Apollo
30
30
 
31
31
  field :url
32
32
  field :state
33
+ field :crawler_name
33
34
 
34
35
  # Indexes
35
- index({ created_at: 1, updated_at: 1 })
36
+ index({ created_at: 1, updated_at: 1, crawler_name: 1 })
36
37
  end # class QueuedUrl
37
38
  end # module Model
38
39
  end # module Apollo
@@ -28,10 +28,14 @@ module Apollo
28
28
 
29
29
  store_in collection: "raw_docs"
30
30
 
31
+ field :url
32
+ field :headers
31
33
  field :body
34
+ field :status
35
+ field :sha_hash
32
36
 
33
37
  # Indexes
34
- index({ created_at: 1, updated_at: 1 })
38
+ index({ created_at: 1, updated_at: 1, sha_hash: 1 })
35
39
  end # class RawDocument
36
40
  end # module Model
37
41
  end # module Apollo
@@ -22,43 +22,101 @@ require File.join(File.dirname(__FILE__),'base_planner')
22
22
 
23
23
  require File.join(File.dirname(__FILE__),'../model/models.rb')
24
24
 
25
+ require File.join(File.dirname(__FILE__),'../agent/exchanges.rb')
26
+
27
+ require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
28
+
29
+ require 'nokogiri'
30
+
25
31
  module Apollo
26
32
  module Planner
27
33
  class SmartPlanner < BasePlanner
28
34
  attr_accessor :amqp
29
35
  attr_accessor :mongo
36
+ attr_accessor :declarations
30
37
 
31
- def initialize(amqp=nil, mongo=nil)
38
+ def initialize(amqp=nil, mongo=nil, opts={})
32
39
  self.amqp = amqp
33
40
  self.mongo = mongo
34
- end
35
41
 
36
- def fetch_url(url, opts={})
37
- puts "AMQP fetching '#{url.inspect}'"
42
+ # Declarations
43
+ channel = amqp.create_channel
44
+ self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
38
45
 
39
- ch = amqp.create_channel
40
- x = ch.default_exchange
41
- x.publish(url.to_json, :routing_key => "fetcher")
46
+ # Bindings
47
+ declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
48
+ msg = JSON.parse(payload)
42
49
 
43
- end
50
+ request = msg['request']
51
+ response = msg['response']
44
52
 
45
- def fetch_queued_urls(opts={})
46
- urls = Apollo::Model::QueuedUrl.where({:state => :queued})
47
- return if urls.count < 1
53
+ doc = Apollo::Model::QueuedUrl.find(request["_id"])
54
+ doc.update_attributes(msg['request'])
55
+ doc.state = "fetched"
56
+ doc.save
57
+
58
+ doc = Apollo::Model::RawDocument.where(:url => request['url']).first
59
+ if doc
60
+ if doc.sha_hash != response['sha_hash']
61
+ puts "Removing old cached version of '#{request['url']}'" if opts[:verbose]
62
+
63
+ doc.destroy
64
+ doc = nil
65
+ else
66
+ puts "Using cached version of '#{request['url']}'" if opts[:verbose]
67
+ end
68
+ else
69
+ doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
70
+ end
71
+
72
+ if(doc.nil?)
73
+ doc = Apollo::Model::RawDocument.new(response).save
74
+ end
75
+
76
+ # Publish
77
+ declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
78
+ end
79
+
80
+ declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
81
+ msg = JSON.parse(payload)
48
82
 
83
+ puts "DOMAINED !!!"
84
+ end
85
+
86
+ declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
87
+ msg = JSON.parse(payload)
88
+
89
+ request = msg['request']
90
+ response = msg['response']
91
+ data = msg['data']
92
+ links = msg['links']
93
+ links = [] if links.nil?
94
+
95
+ links.each do |url|
96
+ link = url['link']
97
+
98
+ Apollo::Scheduler::BaseScheduler::schedule(link, request['crawler_name'])
99
+ end
100
+
101
+ # puts JSON.pretty_generate(data)
102
+ # puts JSON.pretty_generate(links)
103
+ end
104
+ end
105
+
106
+ def fetch_url(url, opts={})
49
107
  if(opts[:verbose])
50
- puts "Fetching Queued URLS"
108
+ puts "AMQP fetching '#{url.inspect}'"
51
109
  end
52
110
 
53
- puts "Count of URLs in Queue: #{urls.count}" if opts[:verbose]
111
+ # Publish
112
+ declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
113
+ end
54
114
 
55
- urls.each do |url|
56
- url.state = :fetching
57
- url.save
115
+ def fetch_queued_urls(opts={})
116
+ while url = Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
117
+ # puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
58
118
 
59
119
  fetch_url(url, opts)
60
-
61
- # puts "Removing URL from Queue '#{url.inspect}'" if opts[:verbose]
62
120
  end
63
121
  end
64
122
 
@@ -37,6 +37,7 @@ require 'eventmachine'
37
37
  require 'em-http'
38
38
 
39
39
  require 'fileutils'
40
+ require 'csv'
40
41
 
41
42
  require 'mongoid'
42
43
 
@@ -44,6 +45,8 @@ require File.join(File.dirname(__FILE__), '..', 'version')
44
45
 
45
46
  require File.join(File.dirname(__FILE__),'base_program')
46
47
 
48
+ require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
49
+
47
50
  module Apollo
48
51
  # Apollo Crawler Base Directory
49
52
  APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")
@@ -97,29 +100,56 @@ module Apollo
97
100
  def enqueue_crawlers_urls(amqp, crawlers=Apollo::Crawler::BaseCrawler.subclasses, opts={})
98
101
  crawlers.each do |crawler|
99
102
  i = crawler.new
100
- puts "Queuying Crawler base URL: '#{i.url}'" if opts[:verbose]
101
- qu = Apollo::Model::QueuedUrl.new(:url => i.url, :state => :queued)
102
- qu.save
103
+ Apollo::Scheduler::BaseScheduler::schedule(i.url, crawler)
103
104
  end
104
105
  end
105
106
 
107
+ def init_crawlers(amqp, opts={})
108
+ crawlers = []
109
+ crawlers << Apollo::Agent::CrawlerAgent.new(amqp, self.options)
110
+ end
111
+
112
+ def init_domainers(amqp, opts={})
113
+ domainers = []
114
+ domainers << Apollo::Agent::DomainerAgent.new(amqp, self.options)
115
+ end
116
+
106
117
  def init_fetchers(amqp, opts={})
107
118
  fetchers = []
108
119
  fetchers << Apollo::Agent::FetcherAgent.new(amqp, self.options)
109
120
 
121
+ # TODO: This should not be here!
110
122
  enqueue_crawlers_urls(amqp, Apollo::Crawler::BaseCrawler.subclasses, opts)
111
-
112
- # ch = self.amqp.create_channel
113
- # x = ch.default_exchange
114
- # x.publish("Hello!", :routing_key => "fetcher")
115
123
  end
116
124
 
117
125
  def init_agents(amqp, opts={})
118
126
  puts "Initializing agents"
119
127
 
128
+ init_crawlers(amqp, opts)
129
+ init_domainers(amqp, opts)
120
130
  init_fetchers(amqp, opts)
121
131
  end
122
132
 
133
+ def init_domains(opts={})
134
+ path = File.join(File.dirname(__FILE__), "../../../tmp/top-1m.csv")
135
+ puts "#{path}"
136
+ if(File.exists?(path) == false)
137
+ return 0
138
+ end
139
+
140
+ Thread::new {
141
+ CSV.foreach(path) do |row|
142
+ name = row[1]
143
+ domain = Apollo::Model::Domain.where({:name => name}).first()
144
+ if(domain.nil?)
145
+ domain = Apollo::Model::Domain.new({:name => name})
146
+ domain.save
147
+ print "."
148
+ end
149
+ end
150
+ }
151
+ end
152
+
123
153
  def init_program(args)
124
154
  res = super(args)
125
155
  return res unless res.nil?
@@ -148,6 +178,8 @@ module Apollo
148
178
  res = super(args)
149
179
  return res unless res.nil?
150
180
 
181
+ init_domains()
182
+
151
183
  # Here we start
152
184
  # if(ARGV.length < 1)
153
185
  # puts optparser
@@ -156,7 +188,7 @@ module Apollo
156
188
 
157
189
  res_code = 0
158
190
  if(self.options[:daemon])
159
- planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo)
191
+ planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo, self.options)
160
192
  res_code = planner.run(self.options)
161
193
  end
162
194
 
@@ -0,0 +1,39 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), "../model/models")
22
+
23
+ module Apollo
24
+ module Scheduler
25
+ class BaseScheduler
26
+ def self.schedule(url, crawler=nil, opts={})
27
+ queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
28
+
29
+ if(queued_url.nil?)
30
+ qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
31
+ qu.save
32
+ return qu
33
+ end
34
+
35
+ return nil
36
+ end
37
+ end # class BaseScheduler
38
+ end # module Scheduler
39
+ end # module Apollo
@@ -0,0 +1,21 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_scheduler')
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.24'
22
+ VERSION = '0.1.25'
23
23
  end # Apollo
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.24
4
+ version: 0.1.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-23 00:00:00.000000000 Z
11
+ date: 2013-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -414,6 +414,9 @@ files:
414
414
  - ./lib/apollo_crawler/adapter/mongo_adapter.rb
415
415
  - ./lib/apollo_crawler/agent/agents.rb
416
416
  - ./lib/apollo_crawler/agent/base_agent.rb
417
+ - ./lib/apollo_crawler/agent/crawler_agent.rb
418
+ - ./lib/apollo_crawler/agent/domainer_agent.rb
419
+ - ./lib/apollo_crawler/agent/exchanges.rb
417
420
  - ./lib/apollo_crawler/agent/fetcher_agent.rb
418
421
  - ./lib/apollo_crawler/cache/base_cache.rb
419
422
  - ./lib/apollo_crawler/cache/caches.rb
@@ -452,6 +455,8 @@ files:
452
455
  - ./lib/apollo_crawler/logger/loggers.rb
453
456
  - ./lib/apollo_crawler/model/base_model.rb
454
457
  - ./lib/apollo_crawler/model/crawler.rb
458
+ - ./lib/apollo_crawler/model/data_source.rb
459
+ - ./lib/apollo_crawler/model/domain.rb
455
460
  - ./lib/apollo_crawler/model/models.rb
456
461
  - ./lib/apollo_crawler/model/queued_url.rb
457
462
  - ./lib/apollo_crawler/model/raw_document.rb
@@ -463,6 +468,8 @@ files:
463
468
  - ./lib/apollo_crawler/program/crawler_program.rb
464
469
  - ./lib/apollo_crawler/program/platform_program.rb
465
470
  - ./lib/apollo_crawler/program/programs.rb
471
+ - ./lib/apollo_crawler/scheduler/base_scheduler.rb
472
+ - ./lib/apollo_crawler/scheduler/schedulers.rb
466
473
  - ./lib/apollo_crawler/store/base_store.rb
467
474
  - ./lib/apollo_crawler/store/stores.rb
468
475
  - ./lib/apollo_crawler/version.rb